diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/bench.slurm deleted file mode 100644 index 0ca19dbb7e5ca9ca4ed8c3819c1e9e4827b4147d..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/config.yaml deleted file mode 100644 index cefcb6f56cb331be656c47a460abdc0993808d4c..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1024 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out deleted file mode 100644 index d7f55f36a66eafab8ef0fa87f851a71d05e5f89a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out +++ /dev/null @@ -1,5696 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:37:23 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:37:26.398000 140508561835840 torch/distributed/run.py:757] -W0703 09:37:26.398000 140508561835840 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.398000 140508561835840 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:37:26.398000 140508561835840 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.396000 140224669833024 torch/distributed/run.py:757] -W0703 09:37:26.396000 140224669833024 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.396000 140224669833024 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:37:26.396000 140224669833024 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.407000 139984102647616 torch/distributed/run.py:757] -W0703 09:37:26.407000 139984102647616 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.407000 139984102647616 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:37:26.407000 139984102647616 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.418000 139711592036160 torch/distributed/run.py:757] -W0703 09:37:26.418000 139711592036160 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.418000 139711592036160 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:37:26.418000 139711592036160 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.424000 139670798600000 torch/distributed/run.py:757] -W0703 09:37:26.424000 139670798600000 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.424000 139670798600000 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:37:26.424000 139670798600000 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.445000 139886570223424 torch/distributed/run.py:757] -W0703 09:37:26.445000 139886570223424 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.445000 139886570223424 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:37:26.445000 139886570223424 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.449000 140488241452864 torch/distributed/run.py:757] -W0703 09:37:26.449000 140488241452864 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.449000 140488241452864 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:37:26.449000 140488241452864 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.528000 140214361397056 torch/distributed/run.py:757] -W0703 09:37:26.528000 140214361397056 torch/distributed/run.py:757] ***************************************** -W0703 09:37:26.528000 140214361397056 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:37:26.528000 140214361397056 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:37:46 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=4, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:37:46 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=1, -[default0]:07/03/2024 09:37:46 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=1024, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1')), -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 09:37:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=6|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=6|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=6|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=1|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=1|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=1|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=8|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=8|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=8|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=15|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=4|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=15|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. 
Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=14|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=2|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=14|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=14|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=15|ip-26-0-166-125]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=2|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=2|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=4|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=4|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=3|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=3|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=5|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=5|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=3|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=5|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=10|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=10|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=10|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=13|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=13|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=13|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=9|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=9|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=9|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=12|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=12|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=12|ip-26-0-166-125]: No checkpoint path provided. 
-[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=7|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=7|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=7|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=2|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=2|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=3|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=8|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=8|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=3|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=4|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=2|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=3|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=6|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=8|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=14|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=12|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=12|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=12|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=14|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=14|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. 
-[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=11|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=11|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=11|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=6|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=5|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=0|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=4|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=5|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=5|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=6|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=4|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=0|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=0|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=1|ip-26-0-163-147]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=1|ip-26-0-163-147]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=1|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=11|ip-26-0-166-125]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=11|ip-26-0-166-125]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=11|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=9|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=10|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=9|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=9|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=10|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=10|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=13|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=13|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=13|ip-26-0-164-207]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=15|ip-26-0-164-207]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=15|ip-26-0-164-207]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=2|TP=15|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=0|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=0|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=0|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=7|ip-26-0-165-24]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=7|ip-26-0-165-24]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 09:38:03 [INFO|DP=0|PP=3|TP=7|ip-26-0-165-24]: No checkpoint path provided. 
-[default0]:07/03/2024 09:38:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:38:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:38:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 09:38:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:38:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 09:38:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:38:07 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:38:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:38:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:38:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 09:38:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 09:38:09.192616 | mbs: 1 | grad_accum: 1024 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:38:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:38:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default3]:07/03/2024 09:38:09 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:38:09 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:38:09 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:38:09 [WARNING|DP=0|PP=3|TP=7|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:38:09 [WARNING|DP=0|PP=1|TP=15|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:38:09 [WARNING|DP=0|PP=1|TP=13|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:38:09 [WARNING|DP=0|PP=1|TP=8|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 09:38:09 [WARNING|DP=0|PP=1|TP=14|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:38:09 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:38:09 [WARNING|DP=0|PP=3|TP=14|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:38:09 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[...] (the same "Repo card metadata block was not found. Setting CardData to empty." warning is logged at 09:38:09 by ranks spanning DP=0, PP=0-3 and TP=0-15 on all eight nodes, ip-26-0-160-225, ip-26-0-161-78, ip-26-0-161-103, ip-26-0-161-138, ip-26-0-163-147, ip-26-0-164-207, ip-26-0-165-24 and ip-26-0-166-125, and each local process additionally echoes it once without the timestamp prefix)
-[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[...] (the identical UserWarning and run_backward line are repeated for every local rank, default0 through default7)
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out.
-[...] (equivalent watchdog timeouts are reported for ranks 0-15 against SeqNum=22 and for ranks 16-31 against SeqNum=34; every pending SEND ran for 600006-600045 milliseconds before timing out)
-[default2]:[rank10]: Traceback (most recent call last):
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank10]: trainer.train(dataloader)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default2]:[rank10]: grad_accumulator.backward(sum(activations))
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default2]:[rank10]: result = loss.backward()
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default2]:[rank10]: torch.autograd.backward(
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default2]:[rank10]: _engine_run_backward(
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default2]:[rank10]: return user_fn(self, *args)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default2]:[rank10]: pipeline_state.run_communication()
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default2]:[rank10]: send_activation()
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default2]:[rank10]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default2]:[rank10]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default2]:[rank10]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default2]:[rank10]: dist.send(
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank10]: return func(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default2]:[rank10]: group.send([tensor], group_dst_rank, tag).wait()
-[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[...] (ranks 13, 31, 15, 29, 25, 12, 17 and 22 fail with identical tracebacks through send_activation -> P2P._send_meta -> dist.send, differing only in the [defaultN]:[rankN] prefix, and raise the same torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.)
-[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank20]: send_activation() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank20]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank20]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank20]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank20]: dist.send( 
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank20]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: result = loss.backward() -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank19]: send_activation() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: _engine_run_backward( -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: return user_fn(self, 
*args) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default2]:[rank18]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank21]: send_activation() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default5]:[rank21]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank19]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank19]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank21]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank21]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: dist.send( -[default2]:[rank18]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank19]: dist.send( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in 
run_communication -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default3]:[rank19]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank18]: send_activation() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank18]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank18]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank21]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank18]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank18]: dist.send( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank18]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank6]: send_activation() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank6]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank6]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank6]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: dist.send( -[default6]:[rank6]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank6]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank5]: send_activation() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank5]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors 
-[default5]:[rank5]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank5]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank5]: dist.send( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank5]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank4]: send_activation() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank4]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank4]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank4]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank4]: dist.send( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank4]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank7]: send_activation() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank7]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank7]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank7]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank7]: dist.send( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank7]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
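Every failing rank above is blocked inside `dist.send` while `send_activation()` forwards a pipeline-parallel activation, and the `ProcessGroupNCCL` watchdog lines further down report the matching picture: a SEND of 6 elements (SeqNum=22) that no peer ever matched, hitting the 600000 ms timeout, after which the communicator is aborted and every in-flight point-to-point call raises `DistBackendError`. The snippet below is a minimal, hypothetical sketch (not the benchmark's code; the file name, the 600-second timeout and the 6-element tensor merely mirror values seen in this log) of how an unmatched point-to-point send under the NCCL backend produces this failure signature.

```python
# Hypothetical reproduction sketch, NOT part of the benchmark: rank 0 posts a send
# that rank 1 never matches with a recv, so rank 0 blocks until the ProcessGroupNCCL
# watchdog (600 s here, as in the log above) aborts the communicator and the blocked
# call surfaces torch.distributed.DistBackendError.
# Launch with: torchrun --nproc_per_node 2 repro_unmatched_send.py
import os
from datetime import timedelta

import torch
import torch.distributed as dist


def main() -> None:
    # torchrun provides RANK, WORLD_SIZE and LOCAL_RANK in the environment.
    dist.init_process_group(backend="nccl", timeout=timedelta(seconds=600))
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    rank = dist.get_rank()

    payload = torch.ones(6, device="cuda")  # 6 elements, like NumelIn=6/NumelOut=6 in the log

    if rank == 0:
        # Blocks forever: the matching dist.recv on rank 1 is (deliberately) missing.
        dist.send(payload, dst=1)
    else:
        # Rank 1 skips the recv and waits at a barrier instead, so the send can never complete.
        dist.barrier()

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

In the real run the receiving side presumably stalled or died for some other reason; the sketch only illustrates why all surviving ranks then report the same aborted-communicator error rather than the root cause.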
[Ranks 28 and 30 then print the same traceback again, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."]
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600006 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f253c3a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f253d67ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f253d683a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f253d684dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: <unknown function> + 0xd3e95 (0x7f258911de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: <unknown function> + 0x8609 (0x7f258e164609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f258df2f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600006 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f253c3a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f253d67ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f253d683a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f253d684dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: <unknown function> + 0xd3e95 (0x7f258911de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: <unknown function> + 0x8609 (0x7f258e164609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f258df2f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f253c3a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: <unknown function> + 0xe32119 (0x7f253d308119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: <unknown function> + 0xd3e95 (0x7f258911de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #3: <unknown function> + 0x8609 (0x7f258e164609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: clone + 0x43 (0x7f258df2f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
[Ranks 3 and 1 print the same Python traceback as the other ranks above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."]
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f153bed1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f153d1aac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f153d1afa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f153d1b0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1588c49e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f158dc90609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f158da5b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f153bed1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f153d1aac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f153d1afa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f153d1b0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1588c49e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f158dc90609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f158da5b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f153bed1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f153ce34119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f1588c49e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f158dc90609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f158da5b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank0]: send_activation() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank0]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank0]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank0]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank0]: dist.send( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank0]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank24]: send_activation() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank24]: 
self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank24]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank24]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank24]: dist.send( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank24]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank26]: send_activation() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank26]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank26]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank26]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank26]: dist.send( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank26]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank11]: send_activation() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank11]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank11]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank11]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank11]: dist.send( 
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank11]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank8]: send_activation() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank8]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default0]:[rank8]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank8]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank8]: dist.send( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank8]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank14]: send_activation() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank14]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank14]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank14]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank14]: dist.send( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank14]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank9]: send_activation() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank9]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank9]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank9]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank9]: dist.send( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank9]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa7b53d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa7b66b0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa7b66b5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa7b66b6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fa80214fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fa807196609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fa806f61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa7b53d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa7b66b0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa7b66b5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa7b66b6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fa80214fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fa807196609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fa806f61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa7b53d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fa7b633a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fa80214fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fa807196609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fa806f61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f24f8cf3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f24f9fccc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f24f9fd1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f24f9fd2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2545a6be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f254aab2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f254a87d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f24f8cf3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f24f9fccc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f24f9fd1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f24f9fd2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2545a6be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f254aab2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f254a87d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f24f8cf3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f24f9c56119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f2545a6be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f254aab2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f254a87d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank23]: send_activation() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank23]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank23]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank23]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank23]: dist.send( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank23]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank16]: send_activation() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank16]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank16]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank16]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank16]: dist.send( 
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank16]: return func(*args, **kwargs)
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default0]:[rank16]: group.send([tensor], group_dst_rank, tag).wait()
-[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank27]: Traceback (most recent call last):
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank27]: trainer.train(dataloader)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank27]: grad_accumulator.backward(sum(activations))
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank27]: result = loss.backward()
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank27]: torch.autograd.backward(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank27]: _engine_run_backward(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank27]: return user_fn(self, *args)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank27]: pipeline_state.run_communication()
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default3]:[rank27]: send_activation()
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default3]:[rank27]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default3]:[rank27]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default3]:[rank27]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default3]:[rank27]: dist.send(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank27]: return func(*args, **kwargs)
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default3]:[rank27]: group.send([tensor], group_dst_rank, tag).wait()
-[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank2]: Traceback (most recent call last):
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank2]: trainer.train(dataloader)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default2]:[rank2]: grad_accumulator.backward(sum(activations))
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default2]:[rank2]: result = loss.backward()
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default2]:[rank2]: torch.autograd.backward(
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default2]:[rank2]: _engine_run_backward(
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default2]:[rank2]: return user_fn(self, *args)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default2]:[rank2]: pipeline_state.run_communication()
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default2]:[rank2]: send_activation()
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default2]:[rank2]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default2]:[rank2]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default2]:[rank2]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default2]:[rank2]: dist.send(
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank2]: return func(*args, **kwargs)
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default2]:[rank2]: group.send([tensor], group_dst_rank, tag).wait()
-[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f337c755897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f337da2ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f337da33a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f337da34dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f33c94cde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f33ce514609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f33ce2df353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f337c755897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f337da2ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f337da33a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f337da34dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f33c94cde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f33ce514609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f33ce2df353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f337c755897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f337d6b8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f33c94cde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f33ce514609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f33ce2df353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa120a80897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa121d59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa121d5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa121d5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa16d7f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa17283f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa17260a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa120a80897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa121d59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa121d5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa121d5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa16d7f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa17283f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa17260a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa120a80897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fa1219e3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fa16d7f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fa17283f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fa17260a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f432c315897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f432d5eec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f432d5f3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f432d5f4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f437908de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f437e0d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f437de9f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f432c315897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f432d5eec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f432d5f3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f432d5f4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f437908de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f437e0d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f437de9f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f432c315897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f432d278119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f437908de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f437e0d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f437de9f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f75aa9ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f75abc84c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f75abc89a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f75abc8adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f75f7723e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f75fc76a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f75fc535353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f75aa9ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f75abc84c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f75abc89a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f75abc8adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f75f7723e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f75fc76a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f75fc535353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f75aa9ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f75ab90e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f75f7723e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f75fc76a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f75fc535353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f025194c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0252c25c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0252c2aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0252c2bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f029e6c4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f02a370b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f02a34d6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f025194c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0252c25c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0252c2aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0252c2bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f029e6c4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f02a370b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f02a34d6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f025194c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f02528af119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f029e6c4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f02a370b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f02a34d6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a4df74897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a4f24dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a4f252a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a4f253dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3a9acece95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3a9fd33609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3a9fafe353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a4df74897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a4f24dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a4f252a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a4f253dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3a9acece95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3a9fd33609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3a9fafe353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a4df74897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f3a4eed7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f3a9acece95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f3a9fd33609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f3a9fafe353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72c9490897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f72ca769c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f72ca76ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f72ca76fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7316208e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f731b24f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f731b01a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72c9490897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f72ca769c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f72ca76ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f72ca76fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7316208e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f731b24f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f731b01a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72c9490897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f72ca3f3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f7316208e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f731b24f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f731b01a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f106098c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1061c65c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1061c6aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1061c6bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f10ad704e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f10b274b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f10b2516353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f106098c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1061c65c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1061c6aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1061c6bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f10ad704e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f10b274b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f10b2516353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f106098c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f10618ef119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f10ad704e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f10b274b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f10b2516353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcae63f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcae76cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcae76cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcae76d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fcb33169e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fcb381b0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fcb37f7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcae63f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcae76cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcae76cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcae76d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fcb33169e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fcb381b0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fcb37f7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcae63f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fcae7354119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fcb33169e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fcb381b0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fcb37f7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a304ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a31785c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a3178aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a3178bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5a7d224e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5a8226b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f5a82036353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a304ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a31785c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a3178aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a3178bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5a7d224e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5a8226b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f5a82036353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a304ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f5a3140f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f5a7d224e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f5a8226b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f5a82036353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f169adf1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f169c0cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f169c0cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f169c0d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f16e7b69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f16ecbb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f16ec97b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f169adf1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f169c0cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f169c0cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f169c0d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f16e7b69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f16ecbb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f16ec97b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f169adf1897 in /fsx/[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. 
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=27, OpType=RECV, NumelIn=7, NumelOut=7, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f169bd54119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f16e7b69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f16ecbb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f16ec97b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f20b5596897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f20b686fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f20b6874a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f20b6875dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f210230ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f2107355609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2107120353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f20b5596897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f20b686fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f20b6874a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f20b6875dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f210230ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f2107355609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2107120353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f20b5596897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f20b64f9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f210230ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f2107355609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f2107120353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc672bee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc673ec7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc673ecca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc673ecddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc6bf966e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc6c49ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc6c4778353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc672bee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc673ec7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc673ecca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc673ecddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc6bf966e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc6c49ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc6c4778353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc672bee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fc673b51119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fc6bf966e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fc6c49ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fc6c4778353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7febfb35c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7febfc635c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7febfc63aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7febfc63bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fec480d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fec4d11b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fec4cee6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7febfb35c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7febfc635c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7febfc63aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7febfc63bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fec480d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fec4d11b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fec4cee6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7febfb35c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7febfc2bf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fec480d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fec4d11b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fec4cee6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2c273c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2c286a1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2c286a6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2c286a7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2c74140e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2c79187609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2c78f52353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2c273c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2c286a1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2c286a6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2c286a7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2c74140e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2c79187609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2c78f52353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2c273c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f2c2832b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f2c74140e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f2c79187609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7f2c78f52353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600042 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600084 milliseconds before timing out.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600076 milliseconds before timing out.
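The "Watchdog caught collective operation timeout ... Timeout(ms)=600000" lines above mean that a point-to-point operation never found its matching peer within the 10-minute process-group timeout, after which the NCCL watchdog aborts the communicator and, per the ProcessGroupNCCL.cpp:583 message, takes the whole process down. The snippet below is not part of the benchmark scripts; it is only a minimal sketch of how an unmatched point-to-point operation runs into the process-group timeout configured via torch.distributed.init_process_group(timeout=...). It assumes the gloo backend (so it runs on CPU) enforces that timeout on point-to-point waits; the port, tensor size, and 10-second timeout are arbitrary demo choices, and with NCCL it is the watchdog thread that reports the timeout and aborts, as in the log above (where the blocked side was a SEND).

import datetime
import time

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    # Short timeout so the demo finishes quickly; the run above used Timeout(ms)=600000.
    dist.init_process_group(
        backend="gloo",  # CPU-only stand-in; the cluster run used NCCL
        init_method="tcp://127.0.0.1:29501",  # arbitrary local rendezvous for the demo
        rank=rank,
        world_size=world_size,
        timeout=datetime.timedelta(seconds=10),
    )
    if rank == 0:
        dist.send(torch.zeros(4), dst=1)  # only one send is ever posted
        time.sleep(20)                    # stay alive past the receiver's timeout
    else:
        buf = torch.empty(4)
        dist.recv(buf, src=0)             # matches the single send above
        try:
            dist.recv(buf, src=0)         # no matching send exists -> blocks until timeout
        except RuntimeError as err:
            print(f"rank {rank}: unmatched recv failed after the group timeout: {err}")
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)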
-[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank41]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank41]: grad_accumulator.backward(sum(activations)) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank41]: result = loss.backward() -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank41]: torch.autograd.backward( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank41]: _engine_run_backward( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank41]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank41]: return user_fn(self, *args) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank41]: self.grads_buffer.append(recv_grad()) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank42]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank42]: grad_accumulator.backward(sum(activations)) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank42]: result = loss.backward() -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank42]: torch.autograd.backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank42]: _engine_run_backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank42]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank42]: return user_fn(self, *args) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank42]: self.grads_buffer.append(recv_grad()) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank39]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank39]: grad_accumulator.backward(sum(activations)) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank39]: result = loss.backward() -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank39]: torch.autograd.backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank39]: _engine_run_backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank39]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank39]: return user_fn(self, *args) -[default7]:[rank39]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank39]: self.grads_buffer.append(recv_grad()) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank36]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank36]: grad_accumulator.backward(sum(activations)) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank36]: result = loss.backward() -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank36]: torch.autograd.backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank36]: _engine_run_backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank36]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank36]: return user_fn(self, *args) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank36]: self.grads_buffer.append(recv_grad()) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank43]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank43]: grad_accumulator.backward(sum(activations)) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank43]: result = loss.backward() -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank43]: torch.autograd.backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank43]: _engine_run_backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank43]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank43]: return user_fn(self, *args) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank43]: self.grads_buffer.append(recv_grad()) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank46]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank46]: grad_accumulator.backward(sum(activations)) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank46]: result = loss.backward() -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank46]: torch.autograd.backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank46]: _engine_run_backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank46]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank46]: return user_fn(self, *args) -[default6]:[rank46]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank46]: self.grads_buffer.append(recv_grad()) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank34]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank34]: grad_accumulator.backward(sum(activations)) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank34]: result = loss.backward() -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank34]: torch.autograd.backward( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank34]: _engine_run_backward( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank34]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank34]: return user_fn(self, *args) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank34]: self.grads_buffer.append(recv_grad()) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) 
-[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank38]: Traceback (most recent call last):
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default6]:[rank38]: trainer.train(dataloader)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank38]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank38]: grad_accumulator.backward(sum(activations))
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default6]:[rank38]: result = loss.backward()
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default6]:[rank38]: torch.autograd.backward(
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default6]:[rank38]: _engine_run_backward(
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default6]:[rank38]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default6]:[rank38]: return user_fn(self, *args)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default6]:[rank38]: pipeline_state.run_communication()
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default6]:[rank38]: self.grads_buffer.append(recv_grad())
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank38]: dist.recv(
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank38]: return func(*args, **kwargs)
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank55]: Traceback (most recent call last):
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default7]:[rank55]: trainer.train(dataloader)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank55]: output = model(**micro_batch)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank55]: sharded_logits = self.model(
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank55]: pipeline_state.run_communication()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank55]: recv_activation_tensor = recv_activation()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank55]: dist.recv(
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank55]: return func(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... identical forward-pass tracebacks (blocked in recv_activation() -> dist.recv, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.") were also emitted by ranks 49, 56, 57, 60, 61, 62 and 63; they match the rank 55 traceback above and are omitted here ...]
[... identical backward-pass tracebacks (blocked in recv_grad() -> dist.recv, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.") were also emitted by ranks 33 and 45; they match the rank 38 traceback above and are omitted here ...]
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fce250dc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fce263b5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fce263baa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fce263bbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7fce71e54e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7fce76e9b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fce76c66353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fce250dc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fce263b5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fce263baa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fce263bbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7fce71e54e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7fce76e9b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fce76c66353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fce250dc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7fce2603f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: + 0xd3e95 (0x7fce71e54e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #3: + 0x8609 (0x7fce76e9b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #4: clone + 0x43 (0x7fce76c66353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
[... the same backward-pass traceback ("NCCL communicator was aborted on rank 0.") was also emitted by ranks 32 and 35; omitted here ...]
-[default4]:[rank44]: Traceback (most recent call last):
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default4]:[rank44]: trainer.train(dataloader)
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default4]:[rank44]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default4]:[rank44]: grad_accumulator.backward(sum(activations))
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default4]:[rank44]: result = loss.backward()
-[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default4]:[rank44]: torch.autograd.backward(
-[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default4]:[rank44]: _engine_run_backward(
-[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default4]:[rank44]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:[rank44]: File
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank44]: return user_fn(self, *args) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank44]: self.grads_buffer.append(recv_grad()) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank40]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank40]: grad_accumulator.backward(sum(activations)) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank40]: result = loss.backward() -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank40]: torch.autograd.backward( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank40]: _engine_run_backward( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank40]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank40]: return user_fn(self, *args) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank40]: self.grads_buffer.append(recv_grad()) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a99f0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a9b1e8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a9b1eda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a9b1eedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5ae6c87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5aebcce609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5aeba99353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a99f0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a9b1e8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a9b1eda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a9b1eedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5ae6c87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5aebcce609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5aeba99353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a99f0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5a9ae72119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5ae6c87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f5aebcce609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f5aeba99353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1d50bc7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1d51ea0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1d51ea5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1d51ea6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f1d9d93fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f1da2986609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1da2751353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1d50bc7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1d51ea0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1d51ea5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1d51ea6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f1d9d93fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f1da2986609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1da2751353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1d50bc7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f1d51b2a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f1d9d93fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f1da2986609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f1da2751353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2a6f979897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2a70c52c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2a70c57a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2a70c58dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2abc6f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2ac1738609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2ac1503353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2a6f979897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2a70c52c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2a70c57a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2a70c58dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2abc6f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2ac1738609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2ac1503353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2a6f979897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f2a708dc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f2abc6f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f2ac1738609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f2ac1503353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank37]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank37]: grad_accumulator.backward(sum(activations)) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank37]: result = loss.backward() -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default5]:[rank37]: torch.autograd.backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank37]: _engine_run_backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank37]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank37]: return user_fn(self, *args) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank37]: self.grads_buffer.append(recv_grad()) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a5ad52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a5c02bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a5c030a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a5c031dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5aa7acae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5aacb11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5aac8dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a5ad52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a5c02bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a5c030a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a5c031dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5aa7acae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5aacb11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5aac8dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a5ad52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5a5bcb5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5aa7acae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f5aacb11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f5aac8dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f85cecac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f85cff85c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f85cff8aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f85cff8bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f861ba24e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f8620a6b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f8620836353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f85cecac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f85cff85c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f85cff8aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f85cff8bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f861ba24e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f8620a6b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f8620836353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f85cecac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f85cfc0f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f861ba24e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f8620a6b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f8620836353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank47]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank47]: grad_accumulator.backward(sum(activations)) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank47]: result = loss.backward() -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default7]:[rank47]: torch.autograd.backward( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank47]: _engine_run_backward( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank47]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank47]: return user_fn(self, *args) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank47]: self.grads_buffer.append(recv_grad()) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b43444897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7b4471dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7b44722a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7b44723dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7b901bce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7b95203609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f7b94fce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b43444897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7b4471dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7b44722a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7b44723dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7b901bce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7b95203609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f7b94fce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b43444897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f7b443a7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f7b901bce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f7b95203609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f7b94fce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c18fbf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c1a298c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c1a29da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c1a29edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c65d37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3c6ad7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3c6ab49353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c18fbf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c1a298c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c1a29da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c1a29edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c65d37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3c6ad7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3c6ab49353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c18fbf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3c19f22119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f3c65d37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f3c6ad7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f3c6ab49353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a6b797897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a6ca70c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a6ca75a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a6ca76dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5ab850fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5abd556609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f5abd321353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a6b797897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a6ca70c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a6ca75a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a6ca76dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5ab850fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5abd556609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f5abd321353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a6b797897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f5a6c6fa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f5ab850fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f5abd556609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f5abd321353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe4f655a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe4f7833c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe4f7838a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe4f7839dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe5432d2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe548319609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe5480e4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe4f655a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe4f7833c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe4f7838a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe4f7839dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe5432d2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbde204d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbde3326c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #5: + 0x8609 (0x7fe548319609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe5480e4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbde332ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbde332cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #4: + 0xd3e95 (0x7fbe2edc5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe4f655a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fe4f74bd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fe5432d2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fe548319609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fe5480e4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #5: + 0x8609 (0x7fbe33e0c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fbe33bd7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbde204d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbde3326c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbde332ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbde332cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fbe2edc5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fbe33e0c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fbe33bd7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbde204d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fbde2fb0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fbe2edc5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fbe33e0c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fbe33bd7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa251088897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa252361c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa252366a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa252367dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa29de00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa2a2e47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa2a2c12353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa251088897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa252361c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa252366a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa252367dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa29de00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa2a2e47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa2a2c12353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa251088897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fa251feb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fa29de00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fa2a2e47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fa2a2c12353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7517408897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f75186e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f75186e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f75186e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7564180e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f75691c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7568f92353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7517408897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f75186e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f75186e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f75186e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7564180e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f75691c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7568f92353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7517408897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f751836b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f7564180e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f75691c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f7568f92353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fddce35f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fddcf638c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fddcf63da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fddcf63edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fde1b0d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fde2011e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fde1fee9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fddce35f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fddcf638c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fddcf63da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fddcf63edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fde1b0d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fde2011e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fde1fee9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fddce35f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fddcf2c2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fde1b0d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fde2011e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fde1fee9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f14b5809897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f14b6ae2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f14b6ae7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f14b6ae8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1502581e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f15075c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1507393353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f14b5809897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f14b6ae2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f14b6ae7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f14b6ae8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1502581e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f15075c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1507393353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f14b5809897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f14b676c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f1502581e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f15075c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f1507393353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faaf6577897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faaf7850c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faaf7855a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faaf7856dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fab432efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fab48336609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fab48101353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faaf6577897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faaf7850c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faaf7855a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faaf7856dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fab432efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fab48336609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fab48101353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faaf6577897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7faaf74da119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fab432efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fab48336609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fab48101353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3740a89897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3741d62c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3741d67a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3741d68dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f378d801e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3792848609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3792613353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3740a89897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3741d62c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3741d67a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3741d68dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f378d801e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3792848609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3792613353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3740a89897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f37419ec119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f378d801e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: <unknown function> + 0x8609 (0x7f3792848609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f3792613353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default4]:[rank52]: Traceback (most recent call last):
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank52]: trainer.train(dataloader)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default4]:[rank52]: output = model(**micro_batch)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default4]:[rank52]: sharded_logits = self.model(
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default4]:[rank52]: pipeline_state.run_communication()
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank52]: recv_activation_tensor = recv_activation()
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank52]: dist.recv(
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank52]: return func(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank54]: Traceback (most recent call last):
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank54]: trainer.train(dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank54]: output = model(**micro_batch)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank54]: sharded_logits = self.model(
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank54]: pipeline_state.run_communication()
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank54]: recv_activation_tensor = recv_activation()
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank54]: dist.recv(
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank54]: return func(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f120d61d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f120e8f6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f120e8fba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f120e8fcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f125a395e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f125f3dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f125f1a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f120d61d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f120e8f6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f120e8fba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f120e8fcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f125a395e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f125f3dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f125f1a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f120d61d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f120e580119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f125a395e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f125f3dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f125f1a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f22ea4a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f22eb77fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f22eb784a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f22eb785dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f233721ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f233c265609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f233c030353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f22ea4a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f22eb77fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f22eb784a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f22eb785dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f233721ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f233c265609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f233c030353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f22ea4a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f22eb409119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f233721ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f233c265609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f233c030353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e2d7c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8e2ea9bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8e2eaa0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8e2eaa1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f8e7a53ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f8e7f581609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f8e7f34c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e2d7c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8e2ea9bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8e2eaa0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8e2eaa1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f8e7a53ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f8e7f581609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f8e7f34c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e2d7c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f8e2e725119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f8e7a53ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f8e7f581609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f8e7f34c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3de144e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3de2727c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3de272ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3de272ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3e2e1c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3e3320d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3e32fd8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3de144e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3de2727c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3de272ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3de272ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3e2e1c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3e3320d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3e32fd8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3de144e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f3de23b1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f3e2e1c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f3e3320d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f3e32fd8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f822d678897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f822e951c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f822e956a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f822e957dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f827a3f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f827f437609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f827f202353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f822d678897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f822e951c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f822e956a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f822e957dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f827a3f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f827f437609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f827f202353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f822d678897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f822e5db119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f827a3f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f827f437609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f827f202353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e41e9a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e43173c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e43178a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e43179dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f2e8ec12e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f2e93c59609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2e93a24353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e41e9a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e43173c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e43178a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e43179dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f2e8ec12e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f2e93c59609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2e93a24353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e41e9a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f2e42dfd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f2e8ec12e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f2e93c59609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f2e93a24353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1cbc960897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1cbdc39c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1cbdc3ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1cbdc3fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1d096d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1d0e71f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1d0e4ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1cbc960897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1cbdc39c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1cbdc3ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1cbdc3fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1d096d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1d0e71f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1d0e4ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1cbc960897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f1cbd8c3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f1d096d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f1d0e71f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f1d0e4ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0f376d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0f389adc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0f389b2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0f389b3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f0f8444ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f0f89493609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f0f8925e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0f376d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0f389adc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0f389b2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0f389b3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f0f8444ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f0f89493609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f0f8925e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0f376d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f0f38637119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f0f8444ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f0f89493609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f0f8925e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc092cc1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc093f9ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc093f9fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc093fa0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc0dfa39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc0e4a80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc0e484b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc092cc1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc093f9ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc093f9fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc093fa0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc0dfa39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc0e4a80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc0e484b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc092cc1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fc093c24119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fc0dfa39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fc0e4a80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fc0e484b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa37ddb3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa37f08cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa37f091a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa37f092dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fa3cab2be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fa3cfb72609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fa3cf93d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
[... the remainder of rank 36's report (a second checkTimeout backtrace from ProcessGroupNCCL.cpp:565 and an ncclCommWatchdog backtrace from ProcessGroupNCCL.cpp:1418) and the analogous reports for the remaining ranks are identical apart from library load addresses — the same ProcessGroupNCCL.cpp:577/583 warnings, the same "terminate called after throwing an instance of 'c10::DistBackendError'" message, and the same backtraces through libc10.so, libtorch_cuda.so, libstdc++.so.6, libpthread.so.0 and libc.so.6 — and are omitted here; only each rank's watchdog timeout summary lines ([E ProcessGroupNCCL.cpp:1537] and [E ProcessGroupNCCL.cpp:1414]) are kept below ...]
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600042 milliseconds before timing out.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600098 milliseconds before timing out.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600000 milliseconds before timing out.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600033 milliseconds before timing out.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600075 milliseconds before timing out.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 27, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=27, OpType=RECV, NumelIn=7, NumelOut=7, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600074 milliseconds before timing out.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2e22fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd2e35d6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd2e35dba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd2e35dcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd32f075e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd3340bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd333e87353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2e22fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd2e35d6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd2e35dba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd2e35dcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd32f075e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd3340bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd333e87353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2e22fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd2e3260119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd32f075e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fd3340bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd333e87353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f726dd73897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f726f04cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f726f051a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f726f052dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f72baaebe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f72bfb32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f72bf8fd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f726dd73897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f726f04cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f726f051a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f726f052dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f72baaebe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f72bfb32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f72bf8fd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f726dd73897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f726ecd6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f72baaebe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f72bfb32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f72bf8fd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc9ca85d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc9cbb36c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc9cbb3ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc9cbb3cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fca175d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fca1c61c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fca1c3e7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc9ca85d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc9cbb36c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc9cbb3ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc9cbb3cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fca175d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fca1c61c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fca1c3e7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc9ca85d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fc9cb7c0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fca175d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fca1c61c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fca1c3e7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f25020e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f25033bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f25033c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f25033c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f254ee5ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2553ea5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2553c70353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f25020e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f25033bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f25033c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f25033c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f254ee5ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2553ea5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2553c70353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f25020e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f2503049119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f254ee5ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f2553ea5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f2553c70353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -W0703 09:48:38.350000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 43173 closing signal SIGTERM -W0703 09:48:38.350000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 43174 closing signal SIGTERM -W0703 09:48:38.350000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 43175 closing signal SIGTERM -W0703 09:48:38.350000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 43176 closing signal SIGTERM -W0703 09:48:38.351000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 43177 closing signal SIGTERM -W0703 09:48:38.351000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 43179 closing signal SIGTERM -W0703 09:48:38.351000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 43180 closing signal SIGTERM -W0703 09:48:38.356000 140224669833024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 206075 closing signal SIGTERM -W0703 09:48:38.357000 140224669833024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 206076 closing signal SIGTERM -W0703 09:48:38.357000 140224669833024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 206077 closing signal SIGTERM -W0703 09:48:38.357000 140224669833024 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 206078 closing signal SIGTERM -W0703 09:48:38.357000 140224669833024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 206079 closing signal SIGTERM -W0703 09:48:38.357000 140224669833024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 206080 closing signal SIGTERM -W0703 09:48:38.357000 140224669833024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 206081 closing signal SIGTERM -W0703 09:48:38.370000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 947350 closing signal SIGTERM -W0703 09:48:38.370000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 947351 closing signal SIGTERM -W0703 09:48:38.370000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 947353 closing signal SIGTERM -W0703 09:48:38.370000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 947354 closing signal SIGTERM -W0703 09:48:38.370000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 947355 closing signal SIGTERM -W0703 09:48:38.370000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 947356 closing signal SIGTERM -W0703 09:48:38.370000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 947357 closing signal SIGTERM -E0703 09:48:38.564000 140508561835840 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 76995) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 76996) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 76996 -[2]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 76997) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 76997 -[3]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 76998) - 
error_file: - traceback : Signal 6 (SIGABRT) received by PID 76998 -[4]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 76999) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 76999 -[5]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 77000) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 77000 -[6]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 77001) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 77001 -[7]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 77002) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 77002 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:38 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 76995) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 76995 -============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -E0703 09:48:39.745000 140224669833024 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 206074) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:48:39.763000 140224669833024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_206005_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:39.793000 140224669833024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_206005_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:39.801000 140224669833024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_206005_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:38 - host : ip-26-0-166-125.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 206074) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 206074 -============================================================ -E0703 09:48:40.161000 140488241452864 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 947352) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:48:40.179000 140488241452864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_947282_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:40.208000 140488241452864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_947282_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:40.216000 140488241452864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_947282_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:38 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 947352) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 947352 -============================================================ -E0703 09:48:40.256000 139984102647616 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 5 (pid: 43178) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:48:40.273000 139984102647616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_43104_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:40.302000 139984102647616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_43104_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:40.311000 139984102647616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_43104_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:38 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 43178) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 43178 -============================================================ -srun: error: ip-26-0-166-125: task 7: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -W0703 09:48:42.131000 139705931302656 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_744323_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:43.059000 139880909489920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_962140_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:43.066000 140208700663552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_472546_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:43.096000 139665137866496 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-147.ec2.internal_859998_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 09:48:43.442000 139670798600000 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 860067) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:48:43.455000 139670798600000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_859998_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 09:48:43.488000 139670798600000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_859998_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 09:48:43.506000 140214361397056 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 472616) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:48:43.518000 139670798600000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_859998_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -W0703 09:48:43.519000 140214361397056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_472546_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal - rank : 33 (local_rank: 1) - exitcode : -6 (pid: 860068) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860068 -[2]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 860069) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860069 -[3]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal - rank : 35 (local_rank: 3) - exitcode : -6 (pid: 860070) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860070 -[4]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 860071) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860071 -[5]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 860072) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860072 -[6]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 860073) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860073 -[7]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal 
- rank : 39 (local_rank: 7) - exitcode : -6 (pid: 860074) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860074 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:43 - host : ip-26-0-163-147.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 860067) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 860067 -============================================================ -W0703 09:48:43.550000 140214361397056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_472546_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 09:48:43.567000 139886570223424 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 962209) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:48:43.579000 140214361397056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_472546_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in -W0703 09:48:43.580000 139886570223424 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_962140_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 472617) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 472617 -[2]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 472618) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 472618 -[3]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 472619) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 472619 -[4]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 472620) - error_file: - traceback : Signal 6 (SIGABRT) received 
by PID 472620 -[5]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 472621) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 472621 -[6]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 472622) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 472622 -[7]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 472623) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 472623 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:43 - host : ip-26-0-164-207.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 472616) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 472616 -============================================================ -E0703 09:48:43.597000 139711592036160 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 744392) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:48:43.609000 139886570223424 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_962140_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:43.610000 139711592036160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_744323_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:48:43.637000 139886570223424 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_962140_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -W0703 09:48:43.639000 139711592036160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_744323_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
- return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 962210) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962210 -[2]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 50 (local_rank: 2) - exitcode : -6 (pid: 962211) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962211 -[3]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 51 (local_rank: 3) - exitcode : -6 (pid: 962212) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962212 -[4]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 962213) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962213 -[5]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 53 (local_rank: 5) - exitcode : -6 (pid: 962214) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962214 -[6]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 962215) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962215 -[7]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 55 (local_rank: 7) - exitcode : -6 (pid: 962216) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962216 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:43 - host : ip-26-0-165-24.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 962209) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 962209 -============================================================ -W0703 09:48:43.668000 139711592036160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_744323_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 744393) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744393 -[2]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 744394) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744394 -[3]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 744395) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744395 -[4]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 744396) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744396 -[5]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 744397) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744397 -[6]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 744398) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744398 -[7]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 744399) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744399 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:48:43 - host : ip-26-0-161-138.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 744392) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 744392 -============================================================ -srun: error: ip-26-0-163-147: task 4: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
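The log above ends with every rank of pipeline group PG 4 aborting on the same NCCL watchdog timeout: rank 40 was stuck in a RECV (SeqNum=27) while ranks 32/35/37/44/47 were stuck in SENDs (SeqNum=26), and each operation was killed after Timeout(ms)=600000, i.e. the 10-minute collective timeout reported by the watchdog. The watchdog then takes the process down with SIGABRT (the exit code -6 / "Signal 6" entries in the elastic failure tables), torchrun terminates the remaining local ranks, and the harness later records the run as "timeout" via the "Timeout at NCCL" grep in bench.slurm (see status.txt below). For reference, the collective timeout is configurable when the process group is created; the snippet below is a minimal, generic PyTorch sketch, not nanotron's own configuration surface (which may expose this differently), and assumes torchrun has already populated the usual env:// rendezvous variables.

import datetime
import torch.distributed as dist

# Raise the per-collective timeout above the ~10 minutes (600000 ms) seen in the
# watchdog messages. This only delays the abort if a peer rank has genuinely
# hung; it does not fix the underlying desynchronization.
dist.init_process_group(
    backend="nccl",
    init_method="env://",  # torchrun sets MASTER_ADDR/MASTER_PORT, RANK, WORLD_SIZE
    timeout=datetime.timedelta(minutes=30),
)

Note that the SEND and RECV sides report different sequence numbers here (26 vs 27), which suggests the pipeline stages were already out of step; a longer timeout would most likely only have postponed the abort, so classifying the run as "timeout" and moving on is the more useful outcome for a benchmark sweep.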
diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/bench.slurm deleted file mode 100644 index 08553158101235e80c3b31568a46297505d17134..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/config.yaml deleted file mode 100644 index 0c2506206c11681f3b7246bed29f0fd2250bf870..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 8 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 128 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out deleted file mode 100644 index 615616a09b6841ab1e6a0ff76cb8a3983591df49..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/log.out +++ /dev/null @@ -1,1845 +0,0 @@ -======================== -START TIME: Wed Jul 3 02:58:45 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 02:58:48.295000 140253797914432 torch/distributed/run.py:757] -W0703 02:58:48.295000 140253797914432 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.295000 140253797914432 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:48.295000 140253797914432 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.294000 139964713871168 torch/distributed/run.py:757] -W0703 02:58:48.294000 139964713871168 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.294000 139964713871168 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:48.294000 139964713871168 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.295000 140152514582336 torch/distributed/run.py:757] -W0703 02:58:48.295000 140152514582336 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.295000 140152514582336 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:48.295000 140152514582336 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.293000 140292847032128 torch/distributed/run.py:757] -W0703 02:58:48.293000 140292847032128 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.293000 140292847032128 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:48.293000 140292847032128 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.301000 140334638495552 torch/distributed/run.py:757] -W0703 02:58:48.301000 140334638495552 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.301000 140334638495552 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 02:58:48.301000 140334638495552 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.302000 140431831271232 torch/distributed/run.py:757] -W0703 02:58:48.302000 140431831271232 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.302000 140431831271232 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:48.302000 140431831271232 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.307000 140571390818112 torch/distributed/run.py:757] -W0703 02:58:48.307000 140571390818112 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.307000 140571390818112 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:48.307000 140571390818112 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.382000 140162037520192 torch/distributed/run.py:757] -W0703 02:58:48.382000 140162037520192 torch/distributed/run.py:757] ***************************************** -W0703 02:58:48.382000 140162037520192 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:48.382000 140162037520192 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 02:59:08 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=4, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=16, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:59:08 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=128, -[default0]:07/03/2024 02:59:08 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=8, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128')), -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272) -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 02:59:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-247]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-247]: No checkpoint path provided. -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-247]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-247]: No checkpoint path provided. -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-247]: No checkpoint path provided. -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-247]: No checkpoint path provided. -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-247]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-246]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Parametrizing model parameters using StandardParametrizator -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=3|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=3|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=3|ip-26-0-169-139]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=5|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=5|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=5|ip-26-0-169-139]: No checkpoint path provided. -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-246]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: No checkpoint path provided. 
-[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=2|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=2|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=2|ip-26-0-169-139]: No checkpoint path provided. -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-247]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. 
Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=4|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=4|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=4|ip-26-0-169-139]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-246]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=1|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-246]: No checkpoint path provided. -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=1|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=1|ip-26-0-169-139]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=7|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=7|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=7|ip-26-0-169-139]: No checkpoint path provided. -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-246]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: No checkpoint path provided. -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: No checkpoint path provided. -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: No checkpoint path provided. -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=6|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=6|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=6|ip-26-0-169-139]: No checkpoint path provided. -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=0|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=0|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 02:59:25 [INFO|DP=0|PP=2|TP=0|ip-26-0-169-139]: No checkpoint path provided. -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 02:59:25 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 02:59:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 02:59:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 02:59:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 02:59:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 02:59:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 02:59:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Using `datasets` library -[default0]:07/03/2024 02:59:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 02:59:29 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 02:59:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 02:59:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 02:59:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: -[default0]:07/03/2024 02:59:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Start training] datetime: 2024-07-03 02:59:31.221273 | mbs: 128 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 02:59:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 02:59:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default3]:07/03/2024 02:59:31 [WARNING|DP=0|PP=0|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=8|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=13|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=10|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=9|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:59:31 [WARNING|DP=0|PP=1|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=12|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=11|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=15|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:59:31 [WARNING|DP=0|PP=3|TP=2|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:59:31 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:59:31 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=3|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=5|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:59:31 [WARNING|DP=0|PP=3|TP=6|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:59:31 [WARNING|DP=0|PP=0|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 02:59:31 [WARNING|DP=0|PP=2|TP=7|ip-26-0-169-139]: Repo card metadata block was not found.
Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:59:31 [WARNING|DP=0|PP=1|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:59:31 [WARNING|DP=0|PP=1|TP=2|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:59:36 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank15]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank15]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/ten[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -sor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.54 GiB is free. Including non-PyTorch memory, this process has 77.78 GiB memory in use. Of the allocated memory 67.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank7]: output = model(**micro_batch) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: sharded_logits = self.model( -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: output = model(**micro_batch) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward 
-[default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: sharded_logits = self.model( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank7]: output = self.pp_block(**new_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank7]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank6]: output = self.pp_block(**new_kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank14]: output = model(**micro_batch) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank14]: sharded_logits = self.model( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -n/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank14]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward - -[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank14]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank14]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank14]: return row_linear( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: out = F.linear(input, weight, bias) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.45 GiB is free. Including non-PyTorch memory, this process has 77.87 GiB memory in use. 
Of the allocated memory 67.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: trainer.train(dataloader) -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank9]: output = model(**micro_batch) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank9]: sharded_logits = self.model( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank9]: output = self.pp_block(**new_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank8]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank9]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank8]: out = F.linear(input, weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. 
GPU -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank9]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: Traceback (most recent call last): -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: Traceback (most recent call last): -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank9]: return row_linear( -[default4]:[rank12]: trainer.train(dataloader) -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank9]: out = F.linear(input, weight, bias) -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: trainer.train(dataloader) -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.54 GiB is free. Including non-PyTorch memory, this process has 77.78 GiB memory in use. Of the allocated memory 67.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank11]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank7]: return row_linear( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default7]:[rank7]: out = differentiable_reduce_scatter_sum(out, group=group) -[default2]:[rank10]: output = model(**micro_batch) -[default6]:[rank6]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default5]:[rank13]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank7]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default4]:[rank12]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default7]:[rank7]: sharded_tensor = torch.empty( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 39.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default6]:[rank6]: out = differentiable_reduce_scatter_sum(out, group=group) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default6]:[rank6]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default6]:[rank6]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default6]:[rank6]: sharded_tensor = torch.empty( -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 127.94 MiB is free. Including non-PyTorch memory, this process has 79.19 GiB memory in use. Of the allocated memory 69.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank11]: output = model(**micro_batch) -[default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank1]: output = model(**micro_batch) -[default5]:[rank13]: output = model(**micro_batch) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank1]: sharded_logits = self.model( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: output = model(**micro_batch) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default2]:[rank10]: sharded_logits = self.model( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank1]: output = self.pp_block(**new_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank1]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank1]: return row_linear( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: out = 
differentiable_reduce_scatter_sum(out, group=group) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default1]:[rank1]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default1]:[rank1]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default1]:[rank1]: sharded_tensor = torch.empty( -[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 39.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank2]: output = model(**micro_batch) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank2]: sharded_logits = self.model( -[default2]:[rank2]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nano[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -tron/models/llama.py", line 764, in forward -[default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank2]: output = self.pp_block(**new_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in [default3]:[rank11]: sharded_logits = self.model( -forward -[default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank2]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank2]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank2]: return row_linear( -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default2]:[rank2]: out = differentiable_reduce_scatter_sum(out, group=group) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default2]:[rank2]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default2]:[rank2]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default2]:[rank2]: sharded_tensor = torch.empty( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 127.94 MiB is free. Including non-PyTorch memory, this process has 79.19 GiB memory in use. Of the allocated memory 69.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank0]: Traceback (most recent call last): -[default5]:[rank5]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: sharded_logits = self.model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default0]:[rank0]: trainer.train(dataloader) -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: sharded_logits = self.model( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: output = model(**micro_batch) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank5]: sharded_logits = self.model( -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: output = model(**micro_batch) -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank13]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default0]:[rank0]: sharded_logits = self.model( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank0]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default4]:[rank12]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default5]:[rank13]: output = self.pp_block(**new_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank0]: output = self.pp_block(**new_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank11]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank10]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) 
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default4]:[rank12]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return row_linear( -[default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default5]:[rank13]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank5]: out = differentiable_reduce_scatter_sum(out, group=group) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default0]:[rank0]: return row_linear( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default2]:[rank10]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank5]: return DifferentiableReduceScatterSum.apply(tensor, group) 
-[default3]:[rank11]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default5]:[rank5]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank0]: out = differentiable_reduce_scatter_sum(out, group=group) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default5]:[rank5]: sharded_tensor = torch.empty( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default4]:[rank12]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank0]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 39.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default0]:[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default0]:[rank0]: sharded_tensor = torch.empty( -[default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. 
GPU -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: trainer.train(dataloader) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank13]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank4]: output = model(**micro_batch) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank4]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank12]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank11]: return row_linear( -[default3]:[rank3]: output = model(**micro_batch) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank3]: sharded_logits = self.model( -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank4]: sharded_logits = self.model( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.54 GiB is free. Including non-PyTorch memory, this process has 77.78 GiB memory in use. Of the allocated memory 67.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: return row_linear( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return row_linear( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: out = F.linear(input, weight, bias) -[default4]:[rank4]: output = self.pp_block(**new_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: return row_linear( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.54 GiB is free. Including non-PyTorch memory, this process has 77.78 GiB memory in use. 
Of the allocated memory 67.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank12]: out = F.linear(input, weight, bias) -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.45 GiB is free. Including non-PyTorch memory, this process has 77.87 GiB memory in use. Of the allocated memory 67.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.45 GiB is free. Including non-PyTorch memory, this process has 77.87 GiB memory in use. Of the allocated memory 67.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank4]: return row_linear( -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default4]:[rank4]: out = differentiable_reduce_scatter_sum(out, group=group) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default4]:[rank4]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default4]:[rank4]: sharded_tensor = torch.empty( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 127.94 MiB is free. Including non-PyTorch memory, this process has 79.19 GiB memory in use. Of the allocated memory 69.96 GiB is allocated by PyTorch, and 58.84 MiB is reserved by PyTorch but unallocated. 
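Both failure groups sit inside nanotron's row-parallel linear: ranks 0-5 die on the torch.empty that materializes the reduce-scatter output shard in DifferentiableReduceScatterSum.forward, while ranks 10-13 die one step earlier on the F.linear output that feeds that reduce-scatter. The sketch below shows the general pattern of a reduce-scatter wrapped in a torch.autograd.Function so that the collective stays differentiable. It is not nanotron's actual implementation; the class name, the sharding along dim 0 and the choice of reduce_scatter_tensor / all_gather_into_tensor are assumptions for illustration only. It is here to make clear where the per-rank buffer is allocated and why that allocation is what trips the out-of-memory condition in the log above.

    # Sketch only (assumed names and collectives), not nanotron's code.
    import torch
    import torch.distributed as dist


    class ReduceScatterSum(torch.autograd.Function):
        """Reduce-scatter (sum) in forward, all-gather in backward."""

        @staticmethod
        def forward(ctx, tensor, group):
            ctx.group = group
            world_size = dist.get_world_size(group)
            assert tensor.shape[0] % world_size == 0
            # The per-rank output shard is materialized here with torch.empty;
            # this is the analogue of the 128.00 MiB allocation that fails above
            # when only ~40 MiB of the 79.33 GiB device is still free.
            sharded = torch.empty(
                tensor.shape[0] // world_size,
                *tensor.shape[1:],
                dtype=tensor.dtype,
                device=tensor.device,
            )
            dist.reduce_scatter_tensor(sharded, tensor, op=dist.ReduceOp.SUM, group=group)
            return sharded

        @staticmethod
        def backward(ctx, grad_output):
            world_size = dist.get_world_size(ctx.group)
            # The gradient of reduce-scatter(sum) is an all-gather of the incoming grads.
            grad_input = torch.empty(
                grad_output.shape[0] * world_size,
                *grad_output.shape[1:],
                dtype=grad_output.dtype,
                device=grad_output.device,
            )
            dist.all_gather_into_tensor(grad_input, grad_output.contiguous(), group=ctx.group)
            return grad_input, None

Note that the allocator hint repeated in every message (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True) only mitigates fragmentation; it does not create headroom when roughly 68-70 GiB of the 79.33 GiB device is already held by parameters, activations and optimizer state. The c10d::allreduce_ UserWarning reproduced below is the backward-pass counterpart of the same theme: PyTorch flags a collective that reaches the autograd engine without a registered autograd kernel of this kind.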
-[...]: [during the backward pass every local rank (default0-default7) emitted the following UserWarning; the duplicate copies from the other ranks are omitted]
-[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -W0703 02:59:54.677000 140334638495552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1657033 closing signal SIGTERM -W0703 02:59:54.678000 140334638495552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1657035 closing signal SIGTERM -E0703 02:59:55.497000 140334638495552 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1657032) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_02:59:54 - host : ip-26-0-162-233.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1657034) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_02:59:54 - host : ip-26-0-162-233.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1657036) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:59:54 - host : ip-26-0-162-233.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1657037) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:59:54 - host : ip-26-0-162-233.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1657038) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:59:54 - host : ip-26-0-162-233.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1657039) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:59:54 - host : ip-26-0-162-233.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1657032) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-162-233: task 0: Exited with exit code 1 -W0703 02:59:58.669000 140146853848832 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_893718_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.387000 140287186298624 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-147.ec2.internal_791043_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.428000 140426170537728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_319266_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.465000 140248137180928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_320736_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.482000 140156376786688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_404303_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.587000 139959053137664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-174-36.ec2.internal_833164_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.637000 140565730084608 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-139.ec2.internal_202106_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 02:59:59.670000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320805 closing signal SIGTERM -W0703 02:59:59.670000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320806 closing signal SIGTERM -W0703 02:59:59.670000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320807 closing signal SIGTERM -W0703 02:59:59.670000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320808 closing signal SIGTERM -W0703 02:59:59.672000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320809 closing signal SIGTERM -W0703 02:59:59.672000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320810 closing signal SIGTERM -W0703 02:59:59.672000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320811 closing signal SIGTERM -W0703 02:59:59.674000 140253797914432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320812 closing signal SIGTERM -W0703 02:59:59.685000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319335 closing signal SIGTERM -W0703 02:59:59.685000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319336 closing signal SIGTERM -W0703 02:59:59.685000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319337 closing signal SIGTERM -W0703 02:59:59.686000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893787 closing signal SIGTERM -W0703 02:59:59.687000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893788 closing signal SIGTERM -W0703 02:59:59.687000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893789 closing signal SIGTERM -W0703 02:59:59.687000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893790 closing signal SIGTERM -W0703 02:59:59.685000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319338 closing signal SIGTERM -W0703 02:59:59.687000 139964713871168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833233 closing signal SIGTERM -W0703 02:59:59.687000 139964713871168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833234 closing signal SIGTERM -W0703 02:59:59.686000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202174 closing signal SIGTERM -W0703 02:59:59.686000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202175 closing signal SIGTERM -W0703 02:59:59.688000 139964713871168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833235 closing signal SIGTERM -W0703 02:59:59.686000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202176 closing signal SIGTERM -W0703 02:59:59.688000 139964713871168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833236 closing signal SIGTERM -W0703 02:59:59.688000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319339 closing signal SIGTERM -W0703 02:59:59.688000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404373 closing signal SIGTERM -W0703 02:59:59.689000 139964713871168 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833237 closing signal SIGTERM -W0703 02:59:59.687000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202177 closing signal SIGTERM -W0703 02:59:59.689000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404374 closing signal SIGTERM -W0703 02:59:59.689000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404375 closing signal SIGTERM -W0703 02:59:59.689000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404376 closing signal SIGTERM -W0703 02:59:59.688000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202178 closing signal SIGTERM -W0703 02:59:59.688000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202179 closing signal SIGTERM -W0703 02:59:59.691000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893791 closing signal SIGTERM -W0703 02:59:59.691000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893792 closing signal SIGTERM -W0703 02:59:59.691000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893793 closing signal SIGTERM -W0703 02:59:59.690000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319340 closing signal SIGTERM -W0703 02:59:59.690000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319341 closing signal SIGTERM -W0703 02:59:59.691000 139964713871168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833238 closing signal SIGTERM -W0703 02:59:59.691000 139964713871168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833239 closing signal SIGTERM -W0703 02:59:59.690000 140431831271232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 319342 closing signal SIGTERM -W0703 02:59:59.691000 139964713871168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 833240 closing signal SIGTERM -W0703 02:59:59.691000 140152514582336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 893794 closing signal SIGTERM -W0703 02:59:59.690000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202180 closing signal SIGTERM -W0703 02:59:59.691000 140571390818112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202181 closing signal SIGTERM -W0703 02:59:59.693000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404377 closing signal SIGTERM -W0703 02:59:59.693000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404378 closing signal SIGTERM -W0703 02:59:59.693000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404379 closing signal SIGTERM -W0703 02:59:59.694000 140162037520192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 404380 closing signal SIGTERM -E0703 02:59:59.800000 140292847032128 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 791112) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 02:59:59.806000 140292847032128 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_791043_0' has failed to shutdown 
the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.832000 140292847032128 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_791043_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:59:59.868000 140292847032128 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_791043_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 791113) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 791114) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 791115) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 791116) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 791117) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 791118) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 791119) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:59:59 - host : ip-26-0-163-147.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 791112) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-163-147: task 1: Exited with exit code 1 -W0703 03:00:03.674000 140146853848832 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_893718_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:04.432000 140426170537728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_319266_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:04.469000 140248137180928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_320736_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:04.487000 140156376786688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_404303_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:04.591000 139959053137664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-174-36.ec2.internal_833164_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:04.642000 140565730084608 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-139.ec2.internal_202106_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:08.678000 140146853848832 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_893718_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:08.924000 139964713871168 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_833164_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:08.931000 139964713871168 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_833164_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -W0703 03:00:09.436000 140426170537728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_319266_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:00:09.473000 140248137180928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_320736_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:09.492000 140156376786688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_404303_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:09.646000 140565730084608 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-139.ec2.internal_202106_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:12.418000 140431831271232 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_319266_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:12.425000 140431831271232 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_319266_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 03:00:12.622000 140253797914432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_320736_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:12.629000 140253797914432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_320736_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-173-246: task 6: Exited with exit code 1 -W0703 03:00:12.828000 140162037520192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_404303_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:12.836000 140162037520192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_404303_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting 
- self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-169-247: task 5: Exited with exit code 1 -W0703 03:00:13.024000 140571390818112 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-139.ec2.internal_202106_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:13.032000 140571390818112 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-139.ec2.internal_202106_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-169-139: task 4: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 2: Exited with exit code 1 -W0703 03:00:13.625000 140152514582336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_893718_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:00:13.632000 140152514582336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_893718_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-128/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/bench.slurm deleted file mode 100644 index a26f3af3c75858b46a243b7853ce4a0095cca337..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/config.yaml deleted file mode 100644 index 50084a2d412ee6020743a3c7fb08cec977f2de19..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 64 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 16 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out deleted file mode 100644 index c5b257f20caf466ba3523c6a0772cb1d45f53f0f..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/log.out +++ /dev/null @@ -1,5697 +0,0 @@ -======================== -START TIME: Wed Jul 3 04:48:59 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 04:49:06.845000 139925998749504 torch/distributed/run.py:757] -W0703 04:49:06.845000 139925998749504 torch/distributed/run.py:757] ***************************************** -W0703 04:49:06.845000 139925998749504 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:49:06.845000 139925998749504 torch/distributed/run.py:757] ***************************************** -W0703 04:49:06.906000 139952454182720 torch/distributed/run.py:757] -W0703 04:49:06.906000 139952454182720 torch/distributed/run.py:757] ***************************************** -W0703 04:49:06.906000 139952454182720 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:49:06.906000 139952454182720 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.135000 140399011194688 torch/distributed/run.py:757] -W0703 04:49:07.135000 140399011194688 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.135000 140399011194688 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:49:07.135000 140399011194688 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.301000 139992165906240 torch/distributed/run.py:757] -W0703 04:49:07.301000 139992165906240 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.301000 139992165906240 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:49:07.301000 139992165906240 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.304000 139768925599552 torch/distributed/run.py:757] -W0703 04:49:07.304000 139768925599552 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.304000 139768925599552 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 04:49:07.304000 139768925599552 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.356000 139875827971904 torch/distributed/run.py:757] -W0703 04:49:07.356000 139875827971904 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.356000 139875827971904 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:49:07.356000 139875827971904 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.410000 140252568274752 torch/distributed/run.py:757] -W0703 04:49:07.410000 140252568274752 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.410000 140252568274752 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:49:07.410000 140252568274752 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.415000 139965246457664 torch/distributed/run.py:757] -W0703 04:49:07.415000 139965246457664 torch/distributed/run.py:757] ***************************************** -W0703 04:49:07.415000 139965246457664 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:49:07.415000 139965246457664 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 04:49:32 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=4, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 04:49:32 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=16, -[default0]:07/03/2024 04:49:32 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=64, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16')), -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 04:49:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=7|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=7|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=7|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=4|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=4|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=4|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=5|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=5|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=3|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=3|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=3|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=5|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=0|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=0|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=0|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=6|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=6|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=6|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=2|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=2|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=2|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=1|ip-26-0-161-78]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=1|ip-26-0-161-78]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=1|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: No checkpoint path provided. 
-[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-138]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-138]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: No checkpoint path provided. 
-[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 04:49:51 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 04:49:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 04:49:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 04:49:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 04:49:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 04:49:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 04:49:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 04:49:54 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 04:49:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 04:49:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 04:49:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 04:49:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 04:49:56.613050 | mbs: 16 | grad_accum: 64 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 04:49:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 04:49:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default0]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=0|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=5|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=1|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=7|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=14|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=8|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=11|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=1|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=5|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=3|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=9|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=0|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. 
Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=13|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=2|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=7|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=2|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=6|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:49:56 [WARNING|DP=0|PP=2|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=4|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=10|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=12|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:49:56 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:49:56 [WARNING|DP=0|PP=1|TP=15|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:49:57 [WARNING|DP=0|PP=2|TP=4|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:49:57 [WARNING|DP=0|PP=1|TP=2|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:49:57 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:49:56 [WARNING|DP=0|PP=3|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:49:57 [WARNING|DP=0|PP=1|TP=6|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:49:57 [WARNING|DP=0|PP=2|TP=3|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 04:50:00 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:50:02 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:50:02 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600067 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600050 milliseconds before timing out.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600074 milliseconds before timing out.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out.
-[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600055 milliseconds before timing out.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600075 milliseconds before timing out.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600079 milliseconds before timing out.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600047 milliseconds before timing out.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600065 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600080 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600071 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600093 milliseconds before timing out.
-[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank23]: send_activation() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank23]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank23]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank23]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank23]: dist.send( 
-[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank23]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank22]: send_activation() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank22]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank22]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank22]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank22]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank22]: dist.send( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank22]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank17]: send_activation() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank17]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank17]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank17]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank17]: dist.send( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank17]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank28]: send_activation() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank28]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank28]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank28]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank28]: dist.send( 
-[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank28]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank29]: send_activation() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank29]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank29]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank29]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank29]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank29]: dist.send( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank29]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in 
backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank5]: send_activation() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank5]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank5]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank5]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank5]: dist.send( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank5]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank1]: send_activation() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", 
line 142, in run_communication -[default6]:[rank6]: send_activation() -[default1]:[rank1]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank1]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank6]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank6]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank6]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: dist.send( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank1]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank6]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank1]: dist.send( -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank1]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank8]: pipeline_state.run_communication() -[default3]:[rank11]: send_activation() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank8]: send_activation() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank8]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank8]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank8]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank8]: dist.send( -[default3]:[rank11]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank8]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on 
rank 0. -[default3]:[rank11]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank11]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank11]: dist.send( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank11]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank15]: send_activation() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank15]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank15]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank15]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank15]: dist.send( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank15]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank27]: result = loss.backward() -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward 
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: _engine_run_backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank27]: return user_fn(self, *args) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank27]: send_activation() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank26]: _engine_run_backward( -[default3]:[rank27]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank27]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank27]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank26]: send_activation() -[default3]:[rank27]: dist.send( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank26]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank26]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank26]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank27]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank26]: dist.send( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank26]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank30]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank30]: send_activation() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank30]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank30]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank30]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank30]: dist.send( 
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank30]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank7]: send_activation() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank7]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default7]:[rank7]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank7]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank7]: dist.send( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank7]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank4]: send_activation() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank4]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank4]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank4]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank4]: dist.send( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank4]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank10]: send_activation() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank10]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank10]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank10]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank10]: dist.send( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank10]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank9]: send_activation() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank9]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank9]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank9]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank9]: dist.send( -[default1]:[rank9]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank9]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank25]: send_activation() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank25]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default1]:[rank25]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank25]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank25]: dist.send( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank25]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef0b6fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef0c9d6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef0c9dba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef0c9dcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fef58475e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fef5d4bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fef5d287353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef0b6fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef0c9d6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef0c9dba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef0c9dcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fef58475e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fef5d4bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fef5d287353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef0b6fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fef0c660119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fef58475e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fef5d4bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fef5d287353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f13d9502897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f13da7dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f13da7e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f13da7e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f142627ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f142b2c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f142b08c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f13d9502897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f13da7dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f13da7e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f13da7e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f142627ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f142b2c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f142b08c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f13d9502897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f13da465119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f142627ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f142b2c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f142b08c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: torch.autograd.backward( -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: result = loss.backward() -[default0]:[rank16]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default4]:[rank20]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: send_activation() -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank20]: send_activation() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank16]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank16]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank16]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank16]: dist.send( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default4]:[rank20]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank16]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank16]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank20]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank20]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank20]: dist.send( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank20]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply 
-[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank19]: send_activation() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank19]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank19]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank19]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank19]: dist.send( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank19]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
[… ranks 12, 13, 14, 18 and 21 abort with the identical send_activation() traceback and "NCCL communicator was aborted on rank 0." error …]
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600065 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f08f6a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f1f0a243c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1f0a248a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1f0a249dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: <unknown function> + 0xd3e95 (0x7f1f55ce2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: <unknown function> + 0x8609 (0x7f1f5ad29609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f1f5aaf4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]:  what():  [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600065 milliseconds before timing out.
[… the checkTimeout stack above is printed a second time as part of the exception message, followed by …]
-[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f08f6a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: <unknown function> + 0xe32119 (0x7f1f09ecd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: <unknown function> + 0xd3e95 (0x7f1f55ce2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #3: <unknown function> + 0x8609 (0x7f1f5ad29609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: clone + 0x43 (0x7f1f5aaf4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
[… ranks 6, 8, 9, 11 and 15 report the same watchdog timeout (WorkNCCL SeqNum=22, OpType=SEND, Timeout(ms)=600000) and are taken down with the same c10::DistBackendError …]
[… ranks 2, 3 and 31 abort with the identical send_activation() traceback and "NCCL communicator was aborted on rank 0." error …]
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe638378897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7fe639651c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe639656a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe639657dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: <unknown function> + 0xd3e95 (0x7fe6850f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: <unknown function> + 0x8609 (0x7fe68a137609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fe689f02353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]:  what():  [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe638378897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe639651c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe639656a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe639657dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe6850f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe68a137609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe689f02353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe638378897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fe6392db119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fe6850f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fe68a137609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fe689f02353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in 
backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank0]: send_activation() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank0]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank0]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank0]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank0]: dist.send( -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank0]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b772d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5b785abc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5b785b0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5b785b1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5bc404ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5bc9091609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5bc8e5c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b772d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5b785abc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5b785b0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5b785b1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5bc404ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5bc9091609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5bc8e5c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b772d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f5b78235119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f5bc404ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f5bc9091609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f5bc8e5c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank24]: send_activation() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank24]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank24]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank24]: self._send_meta(tensor, to_rank=to_rank, 
tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank24]: dist.send( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank24]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6908698897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6909971c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6909976a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6909977dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6955410e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f695a457609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f695a222353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6908698897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6909971c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6909976a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6909977dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6955410e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f695a457609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f695a222353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6908698897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f69095fb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f6955410e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f695a457609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f695a222353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd66d2a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd66e579c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd66e57ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd66e57fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd6ba018e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd6bf05f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd6bee2a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd66d2a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd66e579c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd66e57ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd66e57fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd6ba018e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd6bf05f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd6bee2a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd66d2a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd66e203119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd6ba018e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fd6bf05f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd6bee2a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0174612897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f01758ebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f01758f0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f01758f1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f01c138ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f01c63d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f01c619c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default0]:Exception raised from checkTi[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc502c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc5159cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc515a1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc515a2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7efc9d03be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7efca2082609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7efca1e4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc502c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc5159cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc515a1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc515a2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7efc9d03be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7efca2082609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7efca1e4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc502c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7efc51226119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7efc9d03be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7efca2082609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7efca1e4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -meout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0174612897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f01758ebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f01758f0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f01758f1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f01c138ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f01c63d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f01c619c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0174612897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f0175575119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f01c138ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f01c63d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f01c619c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa64898e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa649c67c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa649c6ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa649c6ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fa695706e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fa69a74d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fa69a518353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa64898e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa649c67c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa649c6ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa649c6ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fa695706e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fa69a74d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fa69a518353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa64898e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fa6498f1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fa695706e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fa69a74d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fa69a518353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7324d28897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7326001c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7326006a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7326007dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f7371aa0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f7376ae7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f73768b2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7324d28897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7326001c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7326006a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7326007dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f7371aa0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f7376ae7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f73768b2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7324d28897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f7325c8b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f7371aa0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f7376ae7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f73768b2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb697a5e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb698d37c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb698d3ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb698d3ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb6e47d6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb6e981d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb6e95e8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb697a5e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb698d37c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb698d3ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb698d3ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb6e47d6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb6e981d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb6e95e8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb697a5e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fb6989c1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb6e47d6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fb6e981d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fb6e95e8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4d14649897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4d15922c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4d15927a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4d15928dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4d613c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4d66408609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f4d661d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4d14649897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4d15922c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4d15927a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4d15928dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4d613c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4d66408609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f4d661d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4d14649897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f4d155ac119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f4d613c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f4d66408609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f4d661d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3bd1d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff3be4b0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff3be4b5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff3be4b6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff409f4fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff40ef96609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff40ed61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3bd1d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff3be4b0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff3be4b5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff3be4b6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff409f4fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff40ef96609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff40ed61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3bd1d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7ff3be13a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7ff409f4fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7ff40ef96609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7ff40ed61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. 
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank32]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank32]: grad_accumulator.backward(sum(activations)) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank32]: result = loss.backward() -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank32]: torch.autograd.backward( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank32]: _engine_run_backward( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank32]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank32]: return user_fn(self, *args) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in 
run_communication -[default0]:[rank32]: self.grads_buffer.append(recv_grad()) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank34]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank34]: grad_accumulator.backward(sum(activations)) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank34]: result = loss.backward() -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank34]: torch.autograd.backward( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank34]: _engine_run_backward( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank34]: return Variable._execution_engine.run_backward( # Calls into the 
C++ engine to run the backward pass -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank34]: return user_fn(self, *args) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank34]: self.grads_buffer.append(recv_grad()) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank35]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank35]: grad_accumulator.backward(sum(activations)) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank35]: result = loss.backward() -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank35]: torch.autograd.backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank35]: _engine_run_backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank35]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank35]: return user_fn(self, *args) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank35]: self.grads_buffer.append(recv_grad()) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank36]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank36]: grad_accumulator.backward(sum(activations)) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank36]: result = loss.backward() -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank36]: torch.autograd.backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank36]: _engine_run_backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank36]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank36]: return user_fn(self, *args) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank36]: self.grads_buffer.append(recv_grad()) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank37]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank37]: grad_accumulator.backward(sum(activations)) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank37]: result = loss.backward() -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank37]: torch.autograd.backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank37]: _engine_run_backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank37]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank37]: return user_fn(self, *args) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank37]: self.grads_buffer.append(recv_grad()) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank42]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank42]: grad_accumulator.backward(sum(activations)) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank42]: result = loss.backward() -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank42]: torch.autograd.backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank42]: _engine_run_backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank42]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank42]: return user_fn(self, *args) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank42]: self.grads_buffer.append(recv_grad()) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3cc6c5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3cc7f35c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3cc7f3aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3cc7f3bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f3d139d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f3d18a1b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f3d187e6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3cc6c5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3cc7f35c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3cc7f3aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3cc7f3bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f3d139d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f3d18a1b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f3d187e6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3cc6c5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f3cc7bbf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f3d139d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f3d18a1b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f3d187e6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank43]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank43]: grad_accumulator.backward(sum(activations)) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank43]: result = loss.backward() -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default3]:[rank43]: torch.autograd.backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank43]: _engine_run_backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank43]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank43]: return user_fn(self, *args) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank43]: self.grads_buffer.append(recv_grad()) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f075d7fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f075ead6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f075eadba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f075eadcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f07aa575e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f07af5bc609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f07af387353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f075d7fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f075ead6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f075eadba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f075eadcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f07aa575e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f07af5bc609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f07af387353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f075d7fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: <unknown function> + 0xe32119 (0x7f075e760119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: <unknown function> + 0xd3e95 (0x7f07aa575e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: <unknown function> + 0x8609 (0x7f07af5bc609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f07af387353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[… identical ProcessGroupNCCL watchdog timeout reports ([E ProcessGroupNCCL.cpp:1537/577/583/1414], WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000)) and c10::DistBackendError stack traces follow for ranks 18, 19, 20, 25, 26, 27, 28 and 30, differing only in library load addresses and in elapsed times between 600047 and 600090 milliseconds …]
-[default7]:[rank47]: Traceback (most recent call last):
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank47]: trainer.train(dataloader)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank47]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank47]: grad_accumulator.backward(sum(activations))
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank47]: result = loss.backward()
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank47]: torch.autograd.backward(
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank47]: _engine_run_backward(
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank47]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank47]: return user_fn(self, *args)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank47]: pipeline_state.run_communication()
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default7]:[rank47]: self.grads_buffer.append(recv_grad())
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank47]: dist.recv(
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank47]: return func(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[… ranks 45 ([default5]) and 41 ([default1]) abort with the same backward-path traceback and "NCCL communicator was aborted on rank 0."; ranks 53 ([default5]), 58 ([default2]) and 52 ([default4]) abort from the forward path instead (train_batch_iter at engine.py:278, forward at engine.py:44, models/llama.py lines 891/764/780, block.py:126, recv_from_pipeline_state_buffer at functional.py:117, recv_activation at state.py:150, then the same p2p.py recv chain) and end with "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." …]
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f532e70a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f532f9e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f532f9e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f532f9e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f537b482e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f53804c9609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f5380294353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f532e70a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f532f9e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f532f9e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f532f9e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f537b482e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f53804c9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5380294353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f532e70a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f532f66d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f537b482e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f53804c9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f5380294353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, 
**kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5db0c54897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5db1f2dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5db1f32a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5db1f33dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5dfd9cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5e02a13609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5e027de353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5db0c54897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5db1f2dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5db1f32a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5db1f33dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5dfd9cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5e02a13609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5e027de353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5db0c54897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5db1bb7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5dfd9cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f5e02a13609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f5e027de353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank38]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank38]: grad_accumulator.backward(sum(activations)) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank38]: result = loss.backward() -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank38]: torch.autograd.backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank38]: _engine_run_backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank38]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank38]: return user_fn(self, *args) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank38]: self.grads_buffer.append(recv_grad()) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank39]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank39]: grad_accumulator.backward(sum(activations)) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank39]: result = loss.backward() -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank39]: torch.autograd.backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank39]: _engine_run_backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank39]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank39]: return user_fn(self, *args) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank39]: self.grads_buffer.append(recv_grad()) -[default7]:[rank39]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank33]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank33]: grad_accumulator.backward(sum(activations)) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank33]: result = loss.backward() -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank33]: torch.autograd.backward( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank33]: _engine_run_backward( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank33]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank33]: return user_fn(self, *args) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank33]: self.grads_buffer.append(recv_grad()) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff5e45e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff5e58bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff5e58c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff5e58c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff63135be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff6363a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff63616d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. 
-[default6]:[rank62]: Traceback (most recent call last):
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank62]: trainer.train(dataloader)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank62]: output = model(**micro_batch)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank62]: return self._call_impl(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank62]: return forward_call(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank62]: sharded_logits = self.model(
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank62]: return self._call_impl(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank62]: return forward_call(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank62]: return self._call_impl(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank62]: return forward_call(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank62]: pipeline_state.run_communication()
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank62]: recv_activation_tensor = recv_activation()
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank62]: dist.recv(
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank62]: return func(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600055 milliseconds before timing out.
-[default0]:[rank40]: Traceback (most recent call last):
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank40]: trainer.train(dataloader)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank40]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank40]: grad_accumulator.backward(sum(activations))
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank40]: result = loss.backward()
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank40]: torch.autograd.backward(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank40]: _engine_run_backward(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank40]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank40]: return user_fn(self, *args)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank40]: pipeline_state.run_communication()
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default0]:[rank40]: self.grads_buffer.append(recv_grad())
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank40]: dist.recv(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank40]: return func(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6bef12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6c01ebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6c01f0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6c01f1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa70bc8ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa710cd1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa710a9c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6bef12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6c01ebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6c01f0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6c01f1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa70bc8ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa710cd1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa710a9c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6bef12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fa6bfe75119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fa70bc8ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fa710cd1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fa710a9c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcdfbaa7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcdfcd80c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcdfcd85a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcdfcd86dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fce4881fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fce4d866609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7e029897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b7f302c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b7f307a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b7f308dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1bcada1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1bcfde8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1bcfbb3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]:frame #6: clone + 0x43 (0x7fce4d631353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7e029897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b7f302c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b7f307a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b7f308dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1bcada1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1bcfde8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1bcfbb3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7e029897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #1: + 0xe32119 (0x7f1b7ef8c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f1bcada1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcdfbaa7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcdfcd80c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcdfcd85a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: + 0x8609 (0x7f1bcfde8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f1bcfbb3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcdfcd86dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fce4881fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fce4d866609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]: -[default2]:frame #6: clone + 0x43 (0x7fce4d631353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcdfbaa7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fcdfca0a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fce4881fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fce4d866609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fce4d631353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = 
self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f243333c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2434615c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f243461aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f243461bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f24800b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f24850fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2484ec6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f243333c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2434615c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f243461aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f243461bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f24800b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f24850fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2484ec6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f243333c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f243429f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f24800b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f24850fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f2484ec6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb868afc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb869dd5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb869ddaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb869ddbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fb8b5874e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fb8ba8bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb8ba686353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f69f0401897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f69f16dac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f69f16dfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f69f16e0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb868afc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #4: + 0xd3e95 (0x7f6a3d179e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6a421c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb869dd5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb869ddaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #6: clone + 0x43 (0x7f6a41f8b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb869ddbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]:frame #4: + 0xd3e95 (0x7fb8b5874e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:frame #5: + 0x8609 (0x7fb8ba8bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb8ba686353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. 
-[default6]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f69f0401897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb868afc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fb869a5f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f69f16dac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f69f16dfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fb8b5874e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f29657e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #3: + 0x8609 (0x7fb8ba8bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f69f16e0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: clone + 0x43 (0x7fb8ba686353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #4: + 0xd3e95 (0x7f6a3d179e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6a421c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f6a41f8b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f815fa37897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2966ac1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f69f0401897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]: -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2966ac6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2966ac7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #1: + 0xe32119 (0x7f69f1364119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f6a3d179e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #4: + 0xd3e95 (0x7f29b2560e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f29b75a7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #3: + 0x8609 (0x7f6a421c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #1: 
c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8160d10c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8160d15a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8160d16dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #6: clone + 0x43 (0x7f29b7372353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:frame #4: + 0xd3e95 (0x7f81ac7afe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f81b17f6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f6a41f8b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #6: clone + 0x43 (0x7f81b15c1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: -[default3]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default6]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f815fa37897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8160d10c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8160d15a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8160d16dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #4: + 0xd3e95 (0x7f81ac7afe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f81b17f6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f81b15c1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f29657e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]: -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2966ac1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2966ac6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2966ac7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f29b2560e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f815fa37897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f816099a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f81ac7afe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f81b17f6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #5: + 0x8609 (0x7f29b75a7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f81b15c1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #6: clone + 0x43 (0x7f29b7372353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]: -[default3]:frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f29657e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f296674b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f29b2560e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f29b75a7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f29b7372353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe32256b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe323844c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe323849a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe32384adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe36f2e3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe37432a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe3740f5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe32256b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe323844c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe323849a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe32384adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe36f2e3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe37432a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe3740f5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe32256b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fe3234ce119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fe36f2e3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fe37432a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fe3740f5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3d2e36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff3d410fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff3d4114a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff3d4115dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff41fbaee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff424bf5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff4249c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3d2e36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff3d410fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff3d4114a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff3d4115dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff41fbaee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff424bf5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff4249c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3d2e36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7ff3d3d99119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7ff41fbaee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7ff424bf5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7ff4249c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feef3a58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feef4d31c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feef4d36a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feef4d37dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fef407d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fef45817609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fef455e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feef3a58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feef4d31c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feef4d36a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feef4d37dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7fef407d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7fef45817609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7fef455e2353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feef3a58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: + 0xe32119 (0x7feef49bb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: + 0xd3e95 (0x7fef407d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: + 0x8609 (0x7fef45817609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7fef455e2353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600062 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02a9c01897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02aaedac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02aaedfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02aaee0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f02f6979e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f02fb9c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f02fb78b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02a9c01897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02aaedac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02aaedfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02aaee0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f02f6979e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f02fb9c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f02fb78b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02a9c01897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f02aab64119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f02f6979e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f02fb9c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f02fb78b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb359cb3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb35af8cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb35af91a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb35af92dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fb3a6a2be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fb3aba72609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fb3ab83d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb359cb3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb35af8cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb35af91a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb35af92dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fb3a6a2be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fb3aba72609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fb3ab83d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb359cb3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fb35ac16119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fb3a6a2be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fb3aba72609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fb3ab83d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f107dbe1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f107eebac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f107eebfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f107eec0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f10ca959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f10cf9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f10cf76b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f107dbe1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f107eebac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f107eebfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f107eec0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f10ca959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f10cf9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f10cf76b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f107dbe1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f107eb44119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f10ca959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f10cf9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f10cf76b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600059 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04d4409897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f04d56e2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f04d56e7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f04d56e8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f0521181e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f05261c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f0525f93353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04d4409897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f04d56e2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f04d56e7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f04d56e8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f0521181e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f05261c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f0525f93353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04d4409897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f04d536c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f0521181e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f05261c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f0525f93353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6023e0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f60250e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60250e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60250e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f6070b82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f6075bc9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f6075994353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6023e0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f60250e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60250e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60250e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f6070b82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f6075bc9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f6075994353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6023e0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f6024d6d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f6070b82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f6075bc9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f6075994353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f31e6a29897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f31e7d02c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f31e7d07a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f31e7d08dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f32337a1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f32387e8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f32385b3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f31e6a29897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f31e7d02c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f31e7d07a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f31e7d08dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f32337a1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f32387e8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f32385b3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f31e6a29897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f31e798c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f32337a1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f32387e8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f32385b3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9085add897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9086db6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9086dbba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9086dbcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f90d2855e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f90d789c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f90d7667353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9085add897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9086db6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9086dbba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9086dbcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f90d2855e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f90d789c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f90d7667353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9085add897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f9086a40119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f90d2855e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f90d789c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f90d7667353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5c186be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5c19997c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5c1999ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5c1999ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5c65436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5c6a47d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5c6a248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5c186be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5c19997c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5c1999ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5c1999ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5c65436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5c6a47d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5c6a248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5c186be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f5c19621119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f5c65436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f5c6a47d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f5c6a248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1e377c6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1e38a9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1e38aa4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1e38aa5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1e8453ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1e89585609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1e89350353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1e377c6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1e38a9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1e38aa4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1e38aa5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1e8453ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1e89585609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1e89350353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1e377c6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f1e38729119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f1e8453ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f1e89585609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f1e89350353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5d413a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd5d5413c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd5d5418a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd5d5419dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd620eb2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd625ef9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd625cc4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5d413a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd5d5413c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd5d5418a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd5d5419dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd620eb2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd625ef9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd625cc4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5d413a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fd5d509d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fd620eb2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fd625ef9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fd625cc4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f678373c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6784a15c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6784a1aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6784a1bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f67d04b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f67d54fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f67d52c6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f678373c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6784a15c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6784a1aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6784a1bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f67d04b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f67d54fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f67d52c6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f678373c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f678469f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f67d04b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f67d54fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f67d52c6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f646bc73897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f646cf4cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f646cf51a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f646cf52dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f64b89ebe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f64bda32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f64bd7fd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f646bc73897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f646cf4cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f646cf51a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f646cf52dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f64b89ebe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f64bda32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f64bd7fd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f646bc73897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f646cbd6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f64b89ebe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f64bda32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f64bd7fd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4626106897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f46273dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f46273e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f46273e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4672e7ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f4677ec5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f4677c90353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4626106897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f46273dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f46273e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f46273e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4672e7ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f4677ec5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f4677c90353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4626106897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f4627069119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f4672e7ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f4677ec5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f4677c90353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f768e478897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f768f751c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f768f756a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f768f757dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f76db1f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f76e0237609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f76e0002353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f768e478897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f768f751c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f768f756a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f768f757dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f76db1f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f76e0237609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f76e0002353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f768e478897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f768f3db119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f76db1f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f76e0237609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f76e0002353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa62a371897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa62b64ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa62b64fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa62b650dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa6770e9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa67c130609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa67befb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa62a371897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa62b64ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa62b64fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa62b650dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa6770e9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa67c130609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa67befb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa62a371897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fa62b2d4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fa6770e9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fa67c130609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fa67befb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe8ccf75897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe8ce24ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe8ce253a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe8ce254dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe919cede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe91ed34609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe91eaff353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe8ccf75897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe8ce24ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe8ce253a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe8ce254dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe919cede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe91ed34609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe91eaff353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe8ccf75897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fe8cded8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fe919cede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fe91ed34609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fe91eaff353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f26bdf92897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f26bf26bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f26bf270a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f26bf271dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f270ad0ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f270fd51609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f270fb1c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f26bdf92897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f26bf26bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f26bf270a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f26bf271dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f270ad0ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f270fd51609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f270fb1c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f26bdf92897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f26beef5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f270ad0ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f270fd51609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f270fb1c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -W0703 05:00:24.224000 139925998749504 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 29969 closing signal SIGTERM -E0703 05:00:24.592000 139925998749504 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 29967) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_05:00:24 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 29968) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 29968 -[2]: - time : 2024-07-03_05:00:24 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 29970) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 29970 -[3]: - time : 2024-07-03_05:00:24 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 29971) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 29971 -[4]: - time : 2024-07-03_05:00:24 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 29972) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 29972 -[5]: - time : 2024-07-03_05:00:24 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 29973) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 29973 -[6]: - time : 2024-07-03_05:00:24 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 29974) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 29974 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_05:00:24 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 29967) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 29967 -============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -W0703 05:00:28.088000 139959585724160 
torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_904313_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:28.501000 139946793449216 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1180459_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:28.669000 140393350461184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3800397_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:28.827000 139986505172736 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_700172_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:28.867000 139763264866048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1455826_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:28.897000 139870167238400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_919211_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:28.964000 140246907541248 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3929725_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 05:00:29.242000 139768925599552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1455905 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180534 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180535 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180536 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180537 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180538 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180539 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180540 closing signal SIGTERM -W0703 05:00:29.256000 139952454182720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1180541 closing signal SIGTERM -W0703 05:00:29.280000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3929800 closing signal SIGTERM -W0703 05:00:29.280000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3929801 closing signal SIGTERM -W0703 05:00:29.280000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3929802 closing signal SIGTERM -W0703 05:00:29.280000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3929803 closing signal SIGTERM -W0703 05:00:29.280000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3929804 closing signal SIGTERM -W0703 05:00:29.280000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3929805 closing signal SIGTERM -W0703 05:00:29.280000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3929806 closing signal SIGTERM -W0703 05:00:29.285000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800472 closing signal SIGTERM -W0703 05:00:29.285000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800473 closing signal SIGTERM -W0703 05:00:29.285000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800474 closing signal SIGTERM -W0703 05:00:29.285000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800475 closing signal SIGTERM -W0703 05:00:29.285000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800476 closing signal SIGTERM -W0703 05:00:29.285000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800477 closing signal SIGTERM -W0703 05:00:29.286000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800478 closing signal SIGTERM -W0703 05:00:29.286000 140399011194688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3800479 closing signal SIGTERM -E0703 05:00:29.363000 139875827971904 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 919285) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 05:00:29.365000 139992165906240 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 700248) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 05:00:29.376000 139875827971904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_919211_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:29.379000 139992165906240 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_700172_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:29.408000 139992165906240 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_700172_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:29.411000 139875827971904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_919211_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:29.440000 139992165906240 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_700172_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 700249) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700249 -[2]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 700250) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700250 -[3]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 700251) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700251 -[4]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 
-6 (pid: 700252) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700252 -[5]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 700253) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700253 -[6]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 700254) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700254 -[7]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 700255) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700255 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-138.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 700248) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 700248 -============================================================ -W0703 05:00:29.447000 139875827971904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_919211_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 919286) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 919286 -[2]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 919287) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 919287 -[3]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 919288) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 919288 -[4]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 919289) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 919289 -[5]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 919290) - error_file: - 
traceback : Signal 6 (SIGABRT) received by PID 919290 -[6]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 919291) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 919291 -[7]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 919292) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 919292 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-88.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 919285) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 919285 -============================================================ -E0703 05:00:29.452000 139965246457664 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 904388) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 05:00:29.464000 139965246457664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_904313_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 05:00:29.473000 139768925599552 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1455901) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 05:00:29.486000 139768925599552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1455826_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:29.493000 139965246457664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_904313_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:29.523000 139768925599552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1455826_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:29.524000 139965246457664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_904313_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 904389) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904389 -[2]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 904390) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904390 -[3]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 904391) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904391 -[4]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 904392) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904392 -[5]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 904393) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904393 -[6]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 904394) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904394 -[7]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 904395) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904395 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 904388) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904388 -============================================================ -W0703 05:00:29.549000 139768925599552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1455826_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-153.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 1455902) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1455902 -[2]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-153.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 1455903) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1455903 -[3]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-153.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 1455904) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1455904 -[4]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-153.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 1455906) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1455906 -[5]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-153.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 1455907) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1455907 -[6]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-153.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 1455908) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1455908 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_05:00:29 - host : ip-26-0-161-153.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 1455901) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1455901 -============================================================ -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 4: Exited with exit code 1 -E0703 05:00:31.599000 140252568274752 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3929799) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 05:00:31.612000 140252568274752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3929725_0' has failed to shutdown the rendezvous 
'none' due to an error of type RendezvousConnectionError. -W0703 05:00:31.642000 140252568274752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3929725_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:31.651000 140252568274752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3929725_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_05:00:29 - host : ip-26-0-171-62.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 3929799) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3929799 -============================================================ -W0703 05:00:32.171000 140399011194688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3800397_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:32.182000 139952454182720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1180459_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:00:32.184000 140399011194688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3800397_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state -W0703 05:00:32.194000 139952454182720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1180459_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. 
- return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-16/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/bench.slurm deleted file mode 100644 index 0787bc06f6304ee6285798a2c095320da3a28f3a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/config.yaml deleted file mode 100644 index 6fcb074183a94b70686c609cd952836aea6a71df..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 512 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 2 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out deleted file mode 100644 index 248ed11290e4fe36fe199c5cb9a712a741a6f67e..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/log.out +++ /dev/null @@ -1,5803 +0,0 @@ -======================== -START TIME: Wed Jul 3 05:49:32 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
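
The config.yaml above fixes dp=1, tp=16, pp=4 with micro_batch_size 2, batch_accumulation_per_replica 512 and sequence_length 4096. A quick sanity check, sketched in shell arithmetic (variable names are illustrative), of how that layout fills the 8x8-GPU allocation and what global batch the log reports further down:

    # Sketch: parallel layout and batch-size arithmetic for this run.
    dp=1; tp=16; pp=4
    nodes=8; gpus_per_node=8
    mbs=2; grad_accum=512; seq_len=4096

    echo "ranks needed:     $((dp * tp * pp))"                    # 64
    echo "GPUs allocated:   $((nodes * gpus_per_node))"           # 64
    echo "global batch:     $((mbs * grad_accum * dp)) sequences" # 1024
    echo "tokens per step:  $((mbs * grad_accum * dp * seq_len))" # 4194304
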
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 05:49:39.096000 139910756243264 torch/distributed/run.py:757] -W0703 05:49:39.096000 139910756243264 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.096000 139910756243264 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:49:39.096000 139910756243264 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.143000 140139802732352 torch/distributed/run.py:757] -W0703 05:49:39.143000 140139802732352 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.143000 140139802732352 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:49:39.143000 140139802732352 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.201000 140566140241728 torch/distributed/run.py:757] -W0703 05:49:39.201000 140566140241728 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.201000 140566140241728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:49:39.201000 140566140241728 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.206000 140322127705920 torch/distributed/run.py:757] -W0703 05:49:39.206000 140322127705920 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.206000 140322127705920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:49:39.206000 140322127705920 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.261000 140445203048256 torch/distributed/run.py:757] -W0703 05:49:39.261000 140445203048256 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.261000 140445203048256 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
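
Each torchrun instance above warns that it is defaulting OMP_NUM_THREADS to 1 for every process. One way to silence the warning and make the thread count an explicit choice is to export the variable in the launcher script before srun; the value below is only an illustration (96 CPUs per node spread over 8 ranks), not a tuned setting:

    # Sketch: pin the OpenMP thread count explicitly instead of relying on torchrun's default of 1.
    export OMP_NUM_THREADS=12   # e.g. 96 CPUs per node / 8 ranks per node; adjust per workload
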
-W0703 05:49:39.261000 140445203048256 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.417000 139765080987456 torch/distributed/run.py:757] -W0703 05:49:39.417000 139765080987456 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.417000 139765080987456 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:49:39.417000 139765080987456 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.571000 139765198563136 torch/distributed/run.py:757] -W0703 05:49:39.571000 139765198563136 torch/distributed/run.py:757] ***************************************** -W0703 05:49:39.571000 139765198563136 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:49:39.571000 139765198563136 torch/distributed/run.py:757] ***************************************** -W0703 05:49:41.308000 139840321193792 torch/distributed/run.py:757] -W0703 05:49:41.308000 139840321193792 torch/distributed/run.py:757] ***************************************** -W0703 05:49:41.308000 139840321193792 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:49:41.308000 139840321193792 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 05:50:06 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=4, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=16, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 05:50:06 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=2, -[default0]:07/03/2024 05:50:06 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=512, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2')), -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272) -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 05:50:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=3|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=3|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=3|ip-26-0-173-202]: No checkpoint path provided. -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=4|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=4|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=4|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=1|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=1|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=1|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. 
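
The "[Vocab Size Padding]" warning and the vocab_size=50272 in the dumped config are consistent with rounding the tokenizer's 50257 entries up to a multiple of the tensor-parallel degree, so the embedding rows split evenly across tp=16 ranks. A small sketch of that rounding under this assumption (the helper name is illustrative):

    # Sketch: round the vocabulary up to a multiple of the TP degree (assumed padding rule).
    pad_vocab() {
        local vocab=$1 tp=$2
        echo $(( (vocab + tp - 1) / tp * tp ))
    }
    pad_vocab 50257 16   # -> 50272, i.e. 15 dummy tokens, matching the warning above
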
-[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Parametrizing model parameters using StandardParametrizator -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=2|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=2|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=2|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=5|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=5|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=5|ip-26-0-173-202]: No checkpoint path provided. -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=0|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=0|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=0|ip-26-0-173-202]: No checkpoint path provided. -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=6|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=6|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=6|ip-26-0-173-202]: No checkpoint path provided. -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=7|ip-26-0-173-202]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=7|ip-26-0-173-202]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=7|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. 
Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=11|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=11|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=11|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=15|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=15|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=8|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=8|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=15|ip-26-0-173-246]: No checkpoint path provided. -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=8|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: No checkpoint path provided. -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=12|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=12|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=12|ip-26-0-173-246]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=9|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=9|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=9|ip-26-0-173-246]: No checkpoint path provided. -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=13|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=13|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=13|ip-26-0-173-246]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=14|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=14|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=14|ip-26-0-173-246]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-7]: No checkpoint path provided. -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-7]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-7]: No checkpoint path provided. -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-7]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=10|ip-26-0-173-246]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=10|ip-26-0-173-246]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=2|TP=10|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-7]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-7]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-7]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-7]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-7]: No checkpoint path provided. -[default6]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-7]: No checkpoint path provided. -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 05:50:24 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: No checkpoint path provided. -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 05:50:24 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 05:50:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 05:50:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 05:50:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 05:50:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 05:50:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Using `datasets` library -[default0]:07/03/2024 05:50:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:50:28 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. 
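
At this point every pipeline stage has reported its per-TP-rank parameter count (24.8M, 18.4M, 15.8M and 16.9M for PP=0..3), and the ZeRO log shows DP rank 0 holding 100% of the optimizer states, which is expected with dp=1 since there is no data-parallel group to shard across. A quick check, sketched with values read off the log above, that the per-stage counts add up to the reported 1.21G total:

    # Sketch: sum the per-TP-rank parameter counts over the 16 TP ranks of each pipeline stage.
    echo "24.8 18.4 15.8 16.9" | awk '{ printf "total: %.1fM (~%.2fG)\n", ($1+$2+$3+$4)*16, ($1+$2+$3+$4)*16/1000 }'
    # -> total: 1214.4M (~1.21G); at 2 bytes per bf16 weight that is ~2.3GiB, in line with the 2315.81MiB reported.
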
-[default0]:07/03/2024 05:50:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 05:50:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 05:50:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: -[default0]:07/03/2024 05:50:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Start training] datetime: 2024-07-03 05:50:30.187971 | mbs: 2 | grad_accum: 512 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 05:50:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 05:50:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=3|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=2|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=5|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=0|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=9|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=13|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=15|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. 
Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=14|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=12|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=11|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=8|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=8|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=13|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=12|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=14|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=9|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=0|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=6|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=10|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=9|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=10|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=7|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=1|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=11|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=3|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=6|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=4|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=1|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=2|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=4|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=5|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=3|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=7|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=4|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=6|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=7|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=5|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=15|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:50:30 [WARNING|DP=0|PP=0|TP=11|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=14|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=10|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:50:30 [WARNING|DP=0|PP=2|TP=1|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:50:30 [WARNING|DP=0|PP=3|TP=2|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 05:50:30 [WARNING|DP=0|PP=1|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 05:50:31 [WARNING|DP=0|PP=3|TP=0|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 05:50:35 [WARNING|DP=0|PP=2|TP=15|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[... the same UserWarning, followed by the same run_backward line, is repeated by each of the other worker processes ...]
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600023 milliseconds before timing out.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out.
-[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out.
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600031 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600077 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600086 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600083 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600072 milliseconds before timing out.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600083 milliseconds before timing out.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default3]:[rank27]: Traceback (most recent call last):
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank27]: trainer.train(dataloader)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank27]: grad_accumulator.backward(sum(activations))
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank27]: result = loss.backward()
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank27]: torch.autograd.backward(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank27]: _engine_run_backward(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank27]: return user_fn(self, *args)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank27]: pipeline_state.run_communication()
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default3]:[rank27]: send_activation()
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default3]:[rank27]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default3]:[rank27]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default3]:[rank27]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default3]:[rank27]: dist.send(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank27]: return func(*args, **kwargs)
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default3]:[rank27]: group.send([tensor], group_dst_rank, tag).wait()
-[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[... ranks 28, 16, 18, 13, 8 and 3 fail with the identical traceback and torch.distributed.DistBackendError ...]
-[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: _engine_run_backward( -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: torch.autograd.backward( -[default7]:[rank7]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: send_activation() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank7]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank7]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank7]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank7]: dist.send( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank7]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank6]: send_activation() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank6]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank6]: futures = 
self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: dist.send( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank6]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return user_fn(self, *args) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: 
pipeline_state.run_communication() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank21]: send_activation() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank21]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank21]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank21]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank21]: dist.send( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank21]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank17]: send_activation() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank17]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank17]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank17]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank17]: dist.send( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank17]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank12]: send_activation() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank12]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank12]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank12]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank12]: dist.send( 
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank12]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank2]: send_activation() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank2]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default2]:[rank2]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank2]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank2]: dist.send( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank2]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank0]: send_activation() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank0]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank0]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[ra[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank22]: send_activation() -[default6]:[rank22]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank22]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank22]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank22]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank22]: dist.send( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank22]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -nk0]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank0]: dist.send( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank0]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank31]: send_activation() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank31]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank31]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank31]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank31]: dist.send( 
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank31]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank25]: send_activation() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank25]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank25]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank25]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank25]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank25]: dist.send( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank25]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank26]: send_activation() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank26]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank26]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank26]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank26]: dist.send( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank26]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank23]: send_activation() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank23]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank23]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank23]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank23]: dist.send( 
-[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank23]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank9]: send_activation() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank9]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default1]:[rank9]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank9]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank9]: dist.send( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank9]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank10]: send_activation() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank10]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank10]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank10]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank10]: dist.send( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank10]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward 
-[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank29]: send_activation() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank29]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank29]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank29]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank29]: dist.send( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank29]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
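Each traceback shows a rank blocked inside send_activation() because some peer never posts the matching receive. With tensor and pipeline parallelism combined, mapping global ranks to parallel coordinates helps show which pipeline stages the blocked senders sit on. The sketch below assumes the dp=1, tp=16, pp=4 layout of this run and that tensor-parallel ranks are contiguous within a stage; nanotron's ParallelContext defines the real ordering, which may differ:

```python
# Hypothetical rank -> (pipeline stage, tensor-parallel rank) mapping,
# assuming dp=1, tp=16, pp=4 and tp-contiguous ordering. The real mapping
# comes from nanotron's ParallelContext and may differ.
TP = 16

def coords(global_rank: int, tp: int = TP) -> tuple[int, int]:
    return global_rank // tp, global_rank % tp  # (pp_stage, tp_rank)

# Ranks whose Python tracebacks appear up to this point in the log.
failing = [0, 2, 6, 7, 9, 10, 12, 17, 21, 22, 23, 25, 26, 29, 31]
for r in failing:
    stage, tp_rank = coords(r)
    print(f"rank {r:2d} -> pp stage {stage}, tp rank {tp_rank:2d}")
```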
-[default4]:[rank20]: Traceback (most recent call last):
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank20]: trainer.train(dataloader)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default4]:[rank20]: grad_accumulator.backward(sum(activations))
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default4]:[rank20]: result = loss.backward()
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default4]:[rank20]: torch.autograd.backward(
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default4]:[rank20]: _engine_run_backward(
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default4]:[rank20]: return user_fn(self, *args)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default4]:[rank20]: pipeline_state.run_communication()
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default4]:[rank20]: send_activation()
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default4]:[rank20]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default4]:[rank20]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default4]:[rank20]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default4]:[rank20]: dist.send(
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank20]: return func(*args, **kwargs)
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default4]:[rank20]: group.send([tensor], group_dst_rank, tag).wait()
-[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600077 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff774817897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff775af0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff775af5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff775af6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: + 0xd3e95 (0x7ff7c158fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: + 0x8609 (0x7ff7c65d6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7ff7c63a1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600077 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff774817897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff775af0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff775af5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff775af6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: + 0xd3e95 (0x7ff7c158fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: + 0x8609 (0x7ff7c65d6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7ff7c63a1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff774817897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: + 0xe32119 (0x7ff77577a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: + 0xd3e95 (0x7ff7c158fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: + 0x8609 (0x7ff7c65d6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7ff7c63a1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
[... the same Python traceback, likewise ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.", is also emitted by ranks 15, 11, 4, 14, 1 and 5 ...]
[... the same ProcessGroupNCCL watchdog timeout dump is also emitted by ranks 8, 6, 7, 3, 2, 12 and 0, whose SEND operations ran for 600090, 600064, 600083, 600086, 600064, 600073 and 600072 milliseconds respectively before timing out ...]
-[default3]:[rank19]: Traceback (most recent call last):
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank19]: trainer.train(dataloader)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank19]: grad_accumulator.backward(sum(activations))
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank19]: result = loss.backward()
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank19]: torch.autograd.backward(
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank19]: _engine_run_backward(
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank19]: return user_fn(self, *args)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank19]: pipeline_state.run_communication()
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default3]:[rank19]: send_activation()
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default3]:[rank19]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default3]:[rank19]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default3]:[rank19]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default3]:[rank19]: dist.send(
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank19]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:frame #4: + 0xd3e95 (0x7fa31f85ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa3248a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa32466c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa2d2ae2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fa2d3a45119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fa31f85ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fa3248a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fa32466c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc8fb052897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc8fc32bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc8fc330a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc8fc331dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc947dcae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc94ce11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc94cbdc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc8fb052897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc8fc32bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc8fc330a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc8fc331dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc947dcae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc94ce11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc94cbdc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc8fb052897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fc8fbfb5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fc947dcae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fc94ce11609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7fc94cbdc353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default6]:[rank30]: Traceback (most recent call last):
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default6]:[rank30]: trainer.train(dataloader)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank30]: grad_accumulator.backward(sum(activations))
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default6]:[rank30]: result = loss.backward()
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default6]:[rank30]: torch.autograd.backward(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default6]:[rank30]: _engine_run_backward(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default6]:[rank30]: return user_fn(self, *args)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default6]:[rank30]: pipeline_state.run_communication()
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default6]:[rank30]: send_activation()
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default6]:[rank30]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default6]:[rank30]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default6]:[rank30]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default6]:[rank30]: dist.send(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank30]: return func(*args, **kwargs)
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default6]:[rank30]: group.send([tensor], group_dst_rank, tag).wait()
-[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f03ada5d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f03aed36c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f03aed3ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f03aed3cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: + 0xd3e95 (0x7f03fa7d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: + 0x8609 (0x7f03ff81c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f03ff5e7353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f03ada5d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f03aed36c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f03aed3ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f03aed3cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: + 0xd3e95 (0x7f03fa7d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: + 0x8609 (0x7f03ff81c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f03ff5e7353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f03ada5d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: + 0xe32119 (0x7f03ae9c0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: + 0xd3e95 (0x7f03fa7d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #3: + 0x8609 (0x7f03ff81c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: clone + 0x43 (0x7f03ff5e7353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default0]:[rank24]: Traceback (most recent call last):
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank24]: trainer.train(dataloader)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank24]: grad_accumulator.backward(sum(activations))
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank24]: result = loss.backward()
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank24]: torch.autograd.backward(
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank24]: _engine_run_backward(
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank24]: return user_fn(self, *args)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank24]: pipeline_state.run_communication()
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default0]:[rank24]: send_activation()
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default0]:[rank24]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default0]:[rank24]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default0]:[rank24]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default0]:[rank24]: dist.send(
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank24]: return func(*args, **kwargs)
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default0]:[rank24]: group.send([tensor], group_dst_rank, tag).wait()
-[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5df8a02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5df9cdbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5df9ce0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5df9ce1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5e4577ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5e4a7c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5e4a58c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5df8a02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5df9cdbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5df9ce0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5df9ce1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5e4577ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5e4a7c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5e4a58c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5df8a02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5df9965119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5e4577ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f5e4a7c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f5e4a58c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a123f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a136cec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a136d3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a136d4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3a5f16de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3a641b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3a63f7f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a123f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a136cec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a136d3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a136d4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3a5f16de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3a641b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3a63f7f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a123f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3a13358119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f3a5f16de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f3a641b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f3a63f7f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84c66bc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84c7995c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84c799aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84c799bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f8513434e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f851847b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f8518246353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84c66bc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84c7995c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84c799aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84c799bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f8513434e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f851847b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f8518246353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84c66bc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f84c761f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f8513434e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f851847b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f8518246353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feb85e04897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feb870ddc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feb870e2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feb870e3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7febd2b7ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7febd7bc3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7febd798e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feb85e04897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feb870ddc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feb870e2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feb870e3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7febd2b7ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7febd7bc3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7febd798e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feb85e04897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7feb86d67119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7febd2b7ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7febd7bc3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7febd798e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd1d56f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd1d69d0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd1d69d5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd1d69d6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd22246fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd2274b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd227281353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd1d56f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd1d69d0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd1d69d5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd1d69d6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd22246fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd2274b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd227281353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd1d56f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd1d665a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd22246fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fd2274b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd227281353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fee26e2f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fee28108c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fee2810da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fee2810edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fee73ba7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fee78bee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fee789b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fee26e2f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fee28108c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fee2810da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fee2810edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fee73ba7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fee78bee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fee789b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fee26e2f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fee27d92119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fee73ba7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fee78bee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fee789b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87909e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8791cc0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8791cc5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8791cc6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f87dd75fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f87e27a6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f87e2571353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87909e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8791cc0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8791cc5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8791cc6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f87dd75fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f87e27a6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f87e2571353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87909e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f879194a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f87dd75fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f87e27a6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f87e2571353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd7274b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcd73a24c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcd73a29a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcd73a2adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fcdbf4c3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fcdc450a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fcdc42d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd7274b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcd73a24c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcd73a29a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcd73a2adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fcdbf4c3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fcdc450a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fcdc42d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd7274b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fcd736ae119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fcdbf4c3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fcdc450a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fcdc42d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a8139f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a82678c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a8267da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a8267edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5ace117e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5ad315e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5ad2f29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a8139f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a82678c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a8267da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a8267edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5ace117e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5ad315e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5ad2f29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a8139f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f5a82302119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f5ace117e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f5ad315e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f5ad2f29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7171b11897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7172deac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7172defa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7172df0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f71be889e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f71c38d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f71c369b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7171b11897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7172deac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7172defa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7172df0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f71be889e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f71c38d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f71c369b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7171b11897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f7172a74119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f71be889e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f71c38d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f71c369b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. 
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f9d1a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9f9e47bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9f9e480a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9f9e481dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9fe9f1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9feef61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9feed2c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f9d1a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9f9e47bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9f9e480a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9f9e481dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9fe9f1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9feef61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9feed2c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f9d1a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f9f9e105119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9fe9f1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f9feef61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f9feed2c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank39]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank39]: grad_accumulator.backward(sum(activations)) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank39]: result = loss.backward() -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: torch.autograd.backward( -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank33]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank39]: _engine_run_backward( -[default1]:[rank33]: grad_accumulator.backward(sum(activations)) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank39]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank33]: result = loss.backward() 
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank33]: torch.autograd.backward( -[default7]:[rank39]: return user_fn(self, *args) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank39]: pipeline_state.run_communication() -[default1]:[rank33]: _engine_run_backward( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank33]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank33]: return user_fn(self, *args) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank39]: self.grads_buffer.append(recv_grad()) -[default1]:[rank33]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: self.grads_buffer.append(recv_grad()) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: return func(*args, **kwargs) 
-[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank38]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank38]: grad_accumulator.backward(sum(activations)) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank38]: result = loss.backward() -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank38]: torch.autograd.backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank38]: _engine_run_backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank38]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank38]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank38]: return user_fn(self, *args) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank38]: self.grads_buffer.append(recv_grad()) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank41]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank41]: grad_accumulator.backward(sum(activations)) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank41]: result = loss.backward() -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank41]: torch.autograd.backward( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank41]: _engine_run_backward( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank41]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank41]: return user_fn(self, *args) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank41]: self.grads_buffer.append(recv_grad()) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank44]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank44]: grad_accumulator.backward(sum(activations)) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank44]: result = loss.backward() -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank44]: torch.autograd.backward( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank44]: _engine_run_backward( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank44]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank44]: return user_fn(self, *args) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank44]: self.grads_buffer.append(recv_grad()) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank43]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank43]: grad_accumulator.backward(sum(activations)) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank43]: result = loss.backward() -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank43]: torch.autograd.backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank43]: _engine_run_backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank43]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank43]: return user_fn(self, *args) -[default3]:[rank43]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank43]: self.grads_buffer.append(recv_grad()) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: grad_accumulator.backward(sum(activations)) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank32]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank32]: grad_accumulator.backward(sum(activations)) -[default4]:[rank36]: result = loss.backward() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank32]: result = loss.backward() -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank32]: torch.autograd.backward( -[default4]:[rank36]: torch.autograd.backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank36]: _engine_run_backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank32]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank32]: _engine_run_backward( -[default4]:[rank36]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank36]: return user_fn(self, *args) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank32]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank32]: return user_fn(self, *args) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank36]: self.grads_buffer.append(recv_grad()) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank32]: pipeline_state.run_communication() -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank32]: self.grads_buffer.append(recv_grad()) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank34]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank34]: grad_accumulator.backward(sum(activations)) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank34]: result = loss.backward() -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank34]: torch.autograd.backward( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank34]: _engine_run_backward( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank34]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank34]: return user_fn(self, *args) 
-[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank34]: self.grads_buffer.append(recv_grad()) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ff5c12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4ff6eebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4ff6ef0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4ff6ef1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f504298ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f50479d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:frame #6: clone + 0x43 (0x7f504779c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f58d93e9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ff5c12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4ff6eebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f58da6c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f58da6c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f58da6c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5926161e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4ff6ef0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4ff6ef1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f504298ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f50479d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f504779c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ff5c12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f4ff6b75119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f504298ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f592b1a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f592af73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #3: + 0x8609 (0x7f50479d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f504779c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: 
[Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f58d93e9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f58da6c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f58da6c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f58da6c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5926161e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f592b1a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f592af73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f58d93e9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f58da34c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f5926161e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f592b1a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f592af73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3bc3d51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3bc502ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3bc502fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3bc5030dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3c10ac9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3c15b10609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3c158db353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3bc3d51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3bc502ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3bc502fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3bc5030dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3c10ac9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3c15b10609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3c158db353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3bc3d51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f3bc4cb4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f3c10ac9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f3c15b10609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f3c158db353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc55b2c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc56e05c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc56e0aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc56e0bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7efca28a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7efca78eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7efca76b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc55b2c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc56e05c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc56e0aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc56e0bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7efca28a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7efca78eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7efca76b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc55b2c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7efc56a8f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7efca28a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7efca78eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7efca76b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, 
**kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank63]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank40]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank40]: grad_accumulator.backward(sum(activations)) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank40]: result = loss.backward() -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank40]: torch.autograd.backward( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank40]: _engine_run_backward( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank40]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank40]: return user_fn(self, *args) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank40]: self.grads_buffer.append(recv_grad()) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4eb2be2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4eb3ebbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4eb3ec0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4eb3ec1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4eff95ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4f049a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4f0476c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4eb2be2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4eb3ebbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4eb3ec0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4eb3ec1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4eff95ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4f049a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4f0476c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4eb2be2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4eb3b45119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f4eff95ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f4f049a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4f0476c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0acc5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe0adf35c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe0adf3aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe0adf3bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe0f99d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fe0fea1b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fe0fe7e6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0acc5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe0adf35c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe0adf3aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe0adf3bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe0f99d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:frame #5: + 0x8609 (0x7fe0fea1b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default1]:frame #6: clone + 0x43 (0x7fe0fe7e6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f95a0c47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0acc5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f596c289897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f596d562c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f95a1f20c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f596d567a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f596d568dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f95a1f25a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f59b9001e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #1: + 0xe32119 (0x7fe0adbbf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f95a1f26dcc in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fe0f99d4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f59be048609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #3: + 0x8609 (0x7fe0fea1b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: + 0xd3e95 (0x7f95ed9bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #6: clone + 0x43 (0x7f59bde13353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:frame #5: + 0x8609 (0x7f95f2a06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fe0fe7e6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #6: clone + 0x43 (0x7f95f27d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default5]: -[default1]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f95a0c47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f95a1f20c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f596c289897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f596d562c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f596d567a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f596d568dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f59b9001e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f59be048609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f59bde13353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at 
../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f596c289897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f596d1ec119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f95a1f25a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f59b9001e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f59be048609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f59bde13353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f95a1f26dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f95ed9bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f95f2a06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f95f27d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f95a0c47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f95a1baa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f95ed9bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f95f2a06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f95f27d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", 
line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank52]: Traceback (most recent call last): [identical pipeline-parallel activation-recv traceback as rank 50] -[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600031 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f95bf4a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f95c0781c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f95c0786a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f95c0787dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f960c220e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f9611267609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f9611032353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f95bf4a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f95c0781c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f95c0786a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f95c0787dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f960c220e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f9611267609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f9611032353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f95bf4a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f95c040b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f960c220e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f9611267609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f9611032353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:
-[default7]:[rank55]: Traceback (most recent call last): [identical pipeline-parallel activation-recv traceback as rank 50] -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank62]: Traceback (most recent call last): [identical pipeline-parallel activation-recv traceback as rank 50] -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank57]: Traceback (most recent call last): [identical pipeline-parallel activation-recv traceback as rank 50] -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default3]:[rank51]: Traceback (most recent call last): [identical pipeline-parallel activation-recv traceback as rank 50] -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank42]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank42]: grad_accumulator.backward(sum(activations)) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank42]: result = loss.backward() -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank42]: torch.autograd.backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank42]: _engine_run_backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank42]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank42]: return user_fn(self, *args) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank42]: self.grads_buffer.append(recv_grad()) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. [same watchdog timeout messages and C++ stack dump as rank 19]
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. [same watchdog timeout messages and C++ stack dump as rank 19]
-[default7]:[rank47]: Traceback (most recent call last): [identical pipeline-parallel gradient-recv traceback as rank 42] -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank45]: Traceback (most recent call last): [identical pipeline-parallel gradient-recv traceback as rank 42] -[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank37]: Traceback (most recent call last): [identical pipeline-parallel gradient-recv traceback as rank 42] -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank35]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank35]: grad_accumulator.backward(sum(activations)) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank35]: result = loss.backward() -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank35]: torch.autograd.backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank35]: _engine_run_backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank35]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank35]: return user_fn(self, *args) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank35]: self.grads_buffer.append(recv_grad()) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank46]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank46]: grad_accumulator.backward(sum(activations)) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank46]: result = loss.backward() -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank46]: torch.autograd.backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank46]: _engine_run_backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank46]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank46]: return user_fn(self, *args) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank46]: self.grads_buffer.append(recv_grad()) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = model(**micro_batch) -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: output = model(**micro_batch) -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) 
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, 
**kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2110686897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f211195fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2111964a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2111965dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f215d3fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f2162445609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f2162210353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2110686897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f211195fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2111964a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2111965dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f215d3fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f2162445609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f2162210353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2110686897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f21115e9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f215d3fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f2162445609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f2162210353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3982809897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3983ae2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3983ae7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3983ae8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f39cf581e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f39d45c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f39d4393353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3982809897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3983ae2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3983ae7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3983ae8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f39cf581e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f39d45c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f39d4393353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3982809897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f398376c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f39cf581e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f39d45c8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f39d4393353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: output = model(**micro_batch) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: return 
forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdea2dbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdea4097c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdea409ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdea409ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fdeefb36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fdef4b7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fdef4948353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdea2dbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdea4097c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdea409ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdea409ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fdeefb36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fdef4b7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fdef4948353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdea2dbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fdea3d21119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fdeefb36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fdef4b7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fdef4948353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f83f7397897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f83f8670c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f83f8675a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f83f8676dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f844410fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f8449156609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f8448f21353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f83f7397897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f83f8670c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f83f8675a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f83f8676dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f844410fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f8449156609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f8448f21353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f83f7397897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f83f82fa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f844410fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f8449156609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f8448f21353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1efdcb6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1efef8fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1efef94a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1efef95dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1f4aa2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f1f4fa75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1f4f840353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1efdcb6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1efef8fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1efef94a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1efef95dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1f4aa2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f1f4fa75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1f4f840353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1efdcb6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f1efec19119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f1f4aa2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f1f4fa75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f1f4f840353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f91972897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4f92c4bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4f92c50a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4f92c51dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4fde6eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4fe3731609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4fe34fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f91972897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4f92c4bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4f92c50a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4f92c51dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4fde6eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4fe3731609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4fe34fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f91972897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4f928d5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f4fde6eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f4fe3731609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4fe34fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70b88a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70b9b79c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70b9b7ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70b9b7fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f7105618e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f710a65f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f710a42a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70b88a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70b9b79c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70b9b7ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70b9b7fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f7105618e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f710a65f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f710a42a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70b88a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f70b9803119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f7105618e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f710a65f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f710a42a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f77f48e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f77f5bbcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f77f5bc1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f77f5bc2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f784165be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f78466a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f784646d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f77f48e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f77f5bbcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f77f5bc1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f77f5bc2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f784165be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f78466a2609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f784646d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f77f48e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: + 0xe32119 (0x7f77f5846119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: + 0xd3e95 (0x7f784165be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: + 0x8609 (0x7f78466a2609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7f784646d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default6]:[rank54]: Traceback (most recent call last):
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default6]:[rank54]: trainer.train(dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank54]: output = model(**micro_batch)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank54]: sharded_logits = self.model(
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank54]: pipeline_state.run_communication()
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank54]: recv_activation_tensor = recv_activation()
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank54]: dist.recv(
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank54]: return func(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97464f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f97477cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f97477cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f97477d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7f9793269e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7f97982b0609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7f979807b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
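The rank 54 traceback above shows where the job was hanging when the watchdog fired: nanotron's pipeline-parallel receive path (recv_from_pipeline_state_buffer -> P2P._recv_meta -> torch.distributed.recv), waiting on a peer whose SEND never completed within Timeout(ms)=600000. As a hedged illustration only, not part of the benchmark scripts or of this log: the 600000 ms limit reported by the watchdog corresponds to the PyTorch process-group timeout, which a caller may widen at initialization. Widening it only buys slack for a slow peer; a SEND/RECV pair that is genuinely stuck will still hang until the (larger) timeout fires.

    # Hypothetical sketch, assuming a plain torch.distributed setup; not taken from this run.
    # The 600000 ms in the watchdog messages above is the NCCL process-group timeout.
    from datetime import timedelta
    import torch.distributed as dist

    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))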
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97464f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f97477cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f97477cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f97477d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f9793269e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f97982b0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f979807b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97464f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f9747454119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f9793269e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f97982b0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f979807b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5cc2e78897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5cc4151c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5cc4156a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5cc4157dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5d0fbf0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5d14c37609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5d14a02353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5cc2e78897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5cc4151c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5cc4156a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5cc4157dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5d0fbf0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5d14c37609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5d14a02353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5cc2e78897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5cc3ddb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5d0fbf0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f5d14c37609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f5d14a02353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f553276c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5533a45c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5533a4aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5533a4bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f557f4e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f558452b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f55842f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f553276c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5533a45c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5533a4aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. 
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5533a4bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f557f4e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f558452b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default4]:frame #6: clone + 0x43 (0x7f55842f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f553276c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f55336cf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f557f4e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f558452b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f55842f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5186b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd51998bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd519990a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd519991dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fd56542ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd56a471609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fd56a23c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, 
NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5186b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd51998bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd519990a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd519991dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fd56542ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd56a471609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fd56a23c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5186b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fd519615119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fd56542ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fd56a471609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fd56a23c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f26f074a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f26f1a23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f26f1a28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f26f1a29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f273d4c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2742509609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f27422d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f26f074a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f26f1a23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f26f1a28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f26f1a29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f273d4c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2742509609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f27422d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f26f074a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f26f16ad119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f273d4c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f2742509609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f27422d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f821f9ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8220cc7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8220ccca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8220ccddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f826c766e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f82717ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8271578353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f821f9ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8220cc7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8220ccca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8220ccddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f826c766e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f82717ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8271578353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f821f9ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f8220951119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f826c766e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f82717ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f8271578353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3171b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb31848dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb318492a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb318493dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fb363f2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fb368f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fb368d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3171b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb31848dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb318492a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb318493dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fb363f2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fb368f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fb368d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3171b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fb318117119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fb363f2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fb368f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fb368d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06beb1f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f06bfdf8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f06bfdfda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f06bfdfedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f070b897e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f07108de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f07106a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06beb1f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f06bfdf8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f06bfdfda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f06bfdfedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f070b897e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f07108de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f07106a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06beb1f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f06bfa82119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f070b897e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f07108de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f07106a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff75d118897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff75e3f1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff75e3f6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff75e3f7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7ff7a9e90e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7ff7aeed7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7ff7aeca2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff75d118897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff75e3f1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff75e3f6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff75e3f7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7ff7a9e90e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7ff7aeed7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7ff7aeca2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff75d118897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7ff75e07b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7ff7a9e90e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7ff7aeed7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7ff7aeca2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4408f8a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f440a263c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f440a268a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f440a269dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4455d02e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f445ad49609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f445ab14353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4408f8a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f440a263c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f440a268a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f440a269dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4455d02e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f445ad49609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f445ab14353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4408f8a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f4409eed119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f4455d02e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f445ad49609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f445ab14353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4e6bb59897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4e6ce32c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4e6ce37a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4e6ce38dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f4eb88d1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4ebd918609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4ebd6e3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4e6bb59897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4e6ce32c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4e6ce37a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4e6ce38dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f4eb88d1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4ebd918609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4ebd6e3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4e6bb59897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f4e6cabc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f4eb88d1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f4ebd918609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f4ebd6e3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff0ba821897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff0bbafac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff0bbaffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff0bbb00dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff107599e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff10c5e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff10c3ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff0ba821897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff0bbafac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff0bbaffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff0bbb00dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff107599e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff10c5e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff10c3ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff0ba821897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7ff0bb784119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7ff107599e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7ff10c5e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7ff10c3ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faefae37897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faefc110c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faefc115a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faefc116dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7faf47bafe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7faf4cbf6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7faf4c9c1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faefae37897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faefc110c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faefc115a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faefc116dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7faf47bafe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7faf4cbf6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7faf4c9c1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faefae37897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7faefbd9a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7faf47bafe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7faf4cbf6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7faf4c9c1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e5cc8a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e5df63c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e5df68a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e5df69dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f2ea9a02e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2eaea49609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2eae814353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e5cc8a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e5df63c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e5df68a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e5df69dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f2ea9a02e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2eaea49609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2eae814353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e5cc8a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f2e5dbed119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f2ea9a02e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f2eaea49609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f2eae814353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f938f961897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9390c3ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9390c3fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9390c40dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f93dc6d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f93e1720609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f93e14eb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f938f961897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9390c3ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9390c3fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9390c40dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f93dc6d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f93e1720609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f93e14eb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f938f961897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f93908c4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f93dc6d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f93e1720609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f93e14eb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc41813f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc419418c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc41941da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc41941edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc464eb7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc469efe609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc469cc9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc41813f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc419418c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc41941da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc41941edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc464eb7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc469efe609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc469cc9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc41813f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fc4190a2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fc464eb7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fc469efe609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fc469cc9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f018591b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0186bf4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0186bf9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0186bfadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f01d2693e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f01d76da609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f01d74a5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f018591b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4983be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0186bf4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0186bf9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0186bfadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f01d2693e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f01d76da609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f01d74a5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f018591b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd499697c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd49969ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #1: + 0xe32119 (0x7f018687e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f01d2693e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f01d76da609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f01d74a5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd49969ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd4e5136e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd4ea17d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd4e9f48353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4983be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd499697c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd49969ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd49969ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd4e5136e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd4ea17d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd4e9f48353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4983be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fd499321119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fd4e5136e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fd4ea17d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fd4e9f48353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4fb508897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb4fc7e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb4fc7e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb4fc7e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fb548280e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fb54d2c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fb54d092353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4fb508897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb4fc7e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb4fc7e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb4fc7e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fb548280e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fb54d2c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fb54d092353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4fb508897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fb4fc46b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fb548280e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fb54d2c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fb54d092353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f57e4aaf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f57e5d88c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f57e5d8da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f57e5d8edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5831827e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f583686e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5836639353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f57e4aaf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f57e5d88c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f57e5d8da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f57e5d8edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5831827e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f583686e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5836639353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f57e4aaf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f57e5a12119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f5831827e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f583686e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f5836639353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f49ead2a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f49ec003c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f49ec008a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f49ec009dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4a37aa2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4a3cae9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4a3c8b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f49ead2a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f49ec003c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f49ec008a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f49ec009dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4a37aa2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4a3cae9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4a3c8b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f49ead2a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f49ebc8d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f4a37aa2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f4a3cae9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f4a3c8b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff488943897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff489c1cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff489c21a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff489c22dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ff4d56bbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ff4da702609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ff4da4cd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff488943897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff489c1cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff489c21a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff489c22dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ff4d56bbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ff4da702609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ff4da4cd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff488943897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7ff4898a6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7ff4d56bbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7ff4da702609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7ff4da4cd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e8df3f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdbd98c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8e8f218c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8e8f21da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8e8f21edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8edacb7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f8edfcfe609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f8edfac9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdbdaba2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdbdaba7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdbdaba8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default3]:frame #4: + 0xd3e95 (0x7fdc26641e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fdc2b688609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fdc2b453353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e8df3f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8e8f218c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8e8f21da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8e8f21edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8edacb7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdbd98c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdbdaba2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #5: + 0x8609 (0x7f8edfcfe609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdbdaba7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #6: clone + 0x43 (0x7f8edfac9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdbdaba8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fdc26641e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]: -[default3]:frame #5: + 0x8609 (0x7fdc2b688609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e8df3f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f8e8eea2119 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #6: clone + 0x43 (0x7fdc2b453353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #2: + 0xd3e95 (0x7f8edacb7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #3: + 0x8609 (0x7f8edfcfe609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdbd98c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #4: clone + 0x43 (0x7f8edfac9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #1: + 0xe32119 (0x7fdbda82c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fdc26641e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]: -[default3]:frame #3: + 0x8609 (0x7fdc2b688609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fdc2b453353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -W0703 06:01:38.152000 140139802732352 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 924449 closing signal SIGTERM -E0703 06:01:38.268000 140445203048256 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 822152) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 06:01:38.287000 140566140241728 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1689406) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 06:01:38.309000 139910756243264 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 865069) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 06:01:38.315000 139765198563136 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 435034) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ 
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 822153) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822153 -[2]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 822154) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822154 -[3]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 822155) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822155 -[4]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 822156) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822156 -[5]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 822157) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822157 -[6]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 822158) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822158 -[7]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 822159) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822159 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:38 - host : ip-26-0-163-147.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 822152) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 822152 -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1689407) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689407 -[2]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 
1689408) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689408 -[3]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1689409) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689409 -[4]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1689410) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689410 -[5]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1689411) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689411 -[6]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1689412) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689412 -[7]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1689413) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689413 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:38 - host : ip-26-0-162-233.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1689406) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1689406 -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 865070) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865070 -[2]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 865071) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865071 -[3]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank 
: 59 (local_rank: 3) - exitcode : -6 (pid: 865072) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865072 -[4]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 865073) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865073 -[5]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 865074) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865074 -[6]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 865075) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865075 -[7]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 865076) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865076 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:38 - host : ip-26-0-174-36.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 865069) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 865069 -============================================================ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 435035) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435035 -[2]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 435036) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435036 -[3]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 435037) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435037 -[4]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 435038) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435038 -[5]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 435039) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435039 -[6]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 435040) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435040 -[7]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal 
- rank : 23 (local_rank: 7) - exitcode : -6 (pid: 435041) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435041 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:38 - host : ip-26-0-164-207.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 435034) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 435034 -============================================================ -E0703 06:01:38.570000 140139802732352 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 924450) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:01:38.587000 140139802732352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_924375_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:01:38.614000 140139802732352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_924375_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:01:38.638000 140139802732352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_924375_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:38 - host : ip-26-0-165-24.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 924451) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 924451 -[2]: - time : 2024-07-03_06:01:38 - host : ip-26-0-165-24.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 924452) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 924452 -[3]: - time : 2024-07-03_06:01:38 - host : ip-26-0-165-24.ec2.internal - rank : 28 (local_rank: 4) - exitcode : -6 (pid: 924453) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 924453 -[4]: - time : 2024-07-03_06:01:38 - host : ip-26-0-165-24.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 924454) - error_file: - traceback : Signal 6 (SIGABRT) received by 
PID 924454 -[5]: - time : 2024-07-03_06:01:38 - host : ip-26-0-165-24.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 924455) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 924455 -[6]: - time : 2024-07-03_06:01:38 - host : ip-26-0-165-24.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 924456) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 924456 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:38 - host : ip-26-0-165-24.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 924450) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 924450 -============================================================ -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-147: task 1: Exited with exit code 1 -srun: error: ip-26-0-162-233: task 0: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -W0703 06:01:41.912000 139834660460288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-7.ec2.internal_2055570_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:01:41.965000 139759420253952 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-202.ec2.internal_1332746_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:01:42.866000 140316466972416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_351256_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 06:01:43.296000 139765080987456 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1332820) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:01:43.308000 139765080987456 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1332746_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:01:43.338000 139765080987456 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1332746_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 06:01:43.365000 140322127705920 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 351333) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:01:43.369000 139765080987456 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1332746_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 33 (local_rank: 1) - exitcode : -6 (pid: 1332821) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332821 -[2]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 1332822) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332822 -[3]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 35 (local_rank: 3) - exitcode : -6 (pid: 1332823) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332823 -[4]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 1332824) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332824 -[5]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 1332825) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332825 -[6]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 1332826) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332826 -[7]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 39 (local_rank: 7) - exitcode : -6 (pid: 1332827) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332827 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-202.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 1332820) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1332820 -============================================================ -W0703 06:01:43.376000 140322127705920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_351256_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 06:01:43.404000 140322127705920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_351256_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 06:01:43.410000 139840321193792 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 2055646) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:01:43.424000 139840321193792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2055570_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:01:43.432000 140322127705920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_351256_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 351334) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351334 -[2]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 351335) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351335 -[3]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 351336) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351336 -[4]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 351337) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351337 -[5]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 351338) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351338 -[6]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 351339) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351339 -[7]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal 
- rank : 47 (local_rank: 7) - exitcode : -6 (pid: 351340) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351340 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-246.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 351333) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 351333 -============================================================ -W0703 06:01:43.456000 139840321193792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2055570_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:01:43.485000 139840321193792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2055570_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-7.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 2055647) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055647 -[2]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-7.ec2.internal - rank : 50 (local_rank: 2) - exitcode : -6 (pid: 2055648) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055648 -[3]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-7.ec2.internal - rank : 51 (local_rank: 3) - exitcode : -6 (pid: 2055649) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055649 -[4]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-7.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 2055650) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055650 -[5]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-7.ec2.internal - rank : 53 (local_rank: 5) - exitcode : -6 (pid: 2055651) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055651 -[6]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-7.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 2055652) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055652 -[7]: - time : 2024-07-03_06:01:43 - host : 
ip-26-0-173-7.ec2.internal - rank : 55 (local_rank: 7) - exitcode : -6 (pid: 2055653) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055653 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:01:43 - host : ip-26-0-173-7.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 2055646) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2055646 -============================================================ -srun: error: ip-26-0-173-246: task 6: Exited with exit code 1 -srun: error: ip-26-0-173-202: task 5: Exited with exit code 1 -srun: error: ip-26-0-173-7: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-2/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/bench.slurm deleted file mode 100644 index 9b4147ddd95328a14cf3900f96d184453f1780e9..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/config.yaml deleted file mode 100644 index 77774dd07e09a4ee3e6f82e248716e667b2c5820..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 4 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 256 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out deleted file mode 100644 index afd40da4946f0c1bf549557dc9f147ddf08f1be1..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/log.out +++ /dev/null @@ -1,7229 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:17:36 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:17:39.739000 140703858952000 torch/distributed/run.py:757] -W0703 03:17:39.739000 140703858952000 torch/distributed/run.py:757] ***************************************** -W0703 03:17:39.739000 140703858952000 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:17:39.739000 140703858952000 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.851000 140665481819968 torch/distributed/run.py:757] -W0703 03:17:42.851000 140665481819968 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.851000 140665481819968 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:17:42.851000 140665481819968 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.857000 140598708508480 torch/distributed/run.py:757] -W0703 03:17:42.857000 140598708508480 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.857000 140598708508480 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:17:42.857000 140598708508480 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.865000 140672604792640 torch/distributed/run.py:757] -W0703 03:17:42.865000 140672604792640 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.865000 140672604792640 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:17:42.865000 140672604792640 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.867000 139993479001920 torch/distributed/run.py:757] -W0703 03:17:42.867000 139993479001920 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.867000 139993479001920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
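As a quick cross-check of the topology recorded in the deleted files above: the torchrun launcher starts 8 processes on each of 8 nodes, and the config's parallelism block declares dp=1, tp=16, pp=4, whose product must equal that world size. A minimal sketch of the check (variable names are illustrative only, not part of the original scripts):

# Sketch: world size implied by the launcher vs. the parallel degrees in config.yaml
nnodes, nproc_per_node = 8, 8         # torchrun --nnodes / --nproc_per_node
dp, tp, pp = 1, 16, 4                 # parallelism section of the deleted config
world_size = nnodes * nproc_per_node  # 64 GPUs
assert world_size == dp * tp * pp, "parallel degrees must multiply to the world size"
print(f"{world_size} GPUs = dp {dp} x tp {tp} x pp {pp}")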
-W0703 03:17:42.867000 139993479001920 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.888000 139850509760320 torch/distributed/run.py:757] -W0703 03:17:42.888000 139850509760320 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.888000 139850509760320 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:17:42.888000 139850509760320 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.981000 140661720811328 torch/distributed/run.py:757] -W0703 03:17:42.981000 140661720811328 torch/distributed/run.py:757] ***************************************** -W0703 03:17:42.981000 140661720811328 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:17:42.981000 140661720811328 torch/distributed/run.py:757] ***************************************** -W0703 03:17:43.280000 139967333427008 torch/distributed/run.py:757] -W0703 03:17:43.280000 139967333427008 torch/distributed/run.py:757] ***************************************** -W0703 03:17:43.280000 139967333427008 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:17:43.280000 139967333427008 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:18:07 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config: -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: run='%date_%jobid', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: step=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: consumed_train_samples=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: benchmark_csv_path=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp=4, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp=16, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp_engine=, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_mode=, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: expert_parallel_size=1), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:18:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: eos_token_id=2, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_revision=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_max_length=None), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoint_interval=100000, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: save_initial_state=False, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: log_level_replica='info', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: train_steps=20, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: micro_batch_size=256, -[default0]:07/03/2024 03:18:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: batch_accumulation_per_replica=4, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: val_check_interval=-1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_val_batches=0, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_test_batches=0), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta1=0.9, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta2=0.95, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: name='adamW'), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: zero_stage=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: weight_decay=0.01, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: clip_grad=1.0, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_steps=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_style='linear', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_style='linear', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_steps=19, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: start_training_step=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_splits='train', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: text_column_name='text'), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_loading_workers=0))], -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256')), -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lighteval=None) -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Model Config: -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: 
eos_token_id=2, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272) -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Building model.. -[default0]:07/03/2024 03:18:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Setting PP block ranks... -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: No checkpoint path provided. 
-[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: No checkpoint path provided. 
-[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=6|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=6|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=6|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=0|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=0|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=0|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=1|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=1|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=4|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=4|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=4|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=1|ip-26-0-166-125]: No checkpoint path provided. 
-[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=7|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=7|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=7|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=5|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=5|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=5|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=3|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=3|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=3|ip-26-0-166-125]: No checkpoint path provided. 
-[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Parametrizing model parameters using StandardParametrizator -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: No checkpoint path provided. 
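The parameter totals logged above line up with bfloat16 storage at 2 bytes per parameter; a small sketch of that arithmetic, using only the (rounded) counts from the log, so the reconstructed sizes are approximate:

# Sketch: bf16 parameter memory implied by the logged counts (rounded inputs)
BYTES_PER_BF16 = 2
total_params = 1.21e9   # "Total number of parameters: 1.21G (2315.81MiB)"
local_params = 24.8e6   # per TP rank on the first pipeline stage (47.33MiB)
to_mib = lambda n: n / 2**20
print(f"total ~{to_mib(total_params * BYTES_PER_BF16):,.0f} MiB")  # ~2,308 MiB
print(f"local ~{to_mib(local_params * BYTES_PER_BF16):,.1f} MiB")  # ~47.3 MiB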
-[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. 
Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=2|ip-26-0-166-125]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=2|ip-26-0-166-125]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:18:24 [INFO|DP=0|PP=2|TP=2|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 03:18:24 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 03:18:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 03:18:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 03:18:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 03:18:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 03:18:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Using `datasets` library -[default0]:07/03/2024 03:18:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:18:28 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 03:18:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 03:18:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 03:18:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: -[default0]:07/03/2024 03:18:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Start training] datetime: 2024-07-03 03:18:29.263250 | mbs: 256 | grad_accum: 4 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 03:18:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 03:18:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default3]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=11|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=10|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=12|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=4|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=5|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=3|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=5|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. 
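The [Start training] line above already reports the derived batch sizes; as a sanity check, the numbers it prints follow directly from the config (values copied from that line; the token count per step is the only additional derived figure):

# Sketch: batch-size bookkeeping for the run logged above
micro_batch_size = 256      # mbs
batch_accumulation = 4      # grad_accum per replica
dp = 1                      # data-parallel replicas
sequence_length = 4096

global_batch_size = micro_batch_size * batch_accumulation * dp
tokens_per_step = global_batch_size * sequence_length
print(global_batch_size)    # 1024, matching the log
print(tokens_per_step)      # 4,194,304 tokens per optimizer step

Note that a single micro-batch at this setting is already 256 * 4096 = 1,048,576 tokens.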
-[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=1|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=3|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=7|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=6|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=6|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=8|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=15|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=0|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=7|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=1|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=0|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=2|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=7|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=2|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=5|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=2|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=3|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=13|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=14|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=4|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=9|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=6|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=3|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:18:29 [WARNING|DP=0|PP=1|TP=4|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:18:29 [WARNING|DP=0|PP=2|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:18:29 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:[rank7]: Traceback (most recent call last):
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank7]:     trainer.train(dataloader)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank7]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank7]:     outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default7]:[rank7]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank7]:     output = model(**micro_batch)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank7]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank7]:     return forward_call(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank7]:     sharded_logits = self.model(
-[default7]:[rank7]:   [... the torch.nn.Module _wrapped_call_impl/_call_impl wrapper frames repeat before each sub-module call below and are elided ...]
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank7]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank7]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default7]:[rank7]:     output = self.pp_block(**new_kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default7]:[rank7]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default7]:[rank7]:     output = self.o_proj(attention_output)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default7]:[rank7]:     return row_linear(
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default7]:[rank7]:     out = F.linear(input, weight, bias)
-[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 78.18 GiB memory in use. Of the allocated memory 68.59 GiB is allocated by PyTorch, and 457.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[...]: ranks 0, 2, 3, 4, 8, 10, 11, 12, 14 and 15 raise the same torch.cuda.OutOfMemoryError from the same call chain (the 4.00 GiB allocation in F.linear for the o_proj row-parallel linear), each reporting between 567.94 MiB and 1.23 GiB free of a 79.33 GiB GPU; their interleaved tracebacks are elided here.
-[default1]:[rank1]: Traceback (most recent call last):
-[default1]:[rank1]:   [... same call chain as rank 7 above, from run_train.py line 237 down to the model forward ...]
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank1]: sharded_logits = 
self.model( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank1]: output = self.pp_block(**new_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank1]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank1]: output = self.o_proj(attention_output) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank1]: return 
row_linear( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank1]: out = F.linear(input, weight, bias) -[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 78.18 GiB memory in use. Of the allocated memory 68.59 GiB is allocated by PyTorch, and 457.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank9]: output = model(**micro_batch) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank9]: sharded_logits = self.model( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank9]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank9]: output = self.pp_block(**new_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank9]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank9]: output = self.o_proj(attention_output) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank9]: return row_linear( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank9]: out = F.linear(input, weight, bias) -[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 655.94 MiB is free. Including non-PyTorch memory, this process has 78.68 GiB memory in use. Of the allocated memory 68.59 GiB is allocated by PyTorch, and 457.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank13]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank13]: output = model(**micro_batch) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank13]: sharded_logits = self.model( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank13]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank13]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank13]: output = self.o_proj(attention_output) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: return row_linear( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank13]: out = F.linear(input, weight, bias) -[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 655.94 MiB is free. Including non-PyTorch memory, this process has 78.68 GiB memory in use. Of the allocated memory 68.59 GiB is allocated by PyTorch, and 457.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank5]: output = model(**micro_batch) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: output = model(**micro_batch) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: sharded_logits = self.model( -[default6]:[rank6]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank5]: sharded_logits = self.model( -[default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank6]: output = self.pp_block(**new_kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank6]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank6]: output = self.o_proj(attention_output) -[default6]:[rank6]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: return row_linear( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: out = F.linear(input, weight, bias) -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.23 GiB is free. Including non-PyTorch memory, this process has 78.09 GiB memory in use. Of the allocated memory 68.59 GiB is allocated by PyTorch, and 457.35 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank5]: output = self.o_proj(attention_output) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank5]: return row_linear( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank5]: out = F.linear(input, weight, bias) -[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 78.18 GiB memory in use. Of the allocated memory 68.59 GiB is allocated by PyTorch, and 457.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank25]: Traceback (most recent call last): -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: sharded_logits = self.model( -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: trainer.train(dataloader) -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default2]:[rank26]: output = model(**micro_batch) -[default3]:[rank59]: Traceback (most recent call last): -[default7]:[rank55]: Traceback (most recent call last): -[default0]:[rank48]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: trainer.train(dataloader) -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank26]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: output = model(**micro_batch) -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) 
-[default7]:[rank39]: Traceback (most recent call last):
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank39]:     trainer.train(dataloader)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank39]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank39]:     outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default7]:[rank39]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank39]:     output = model(**micro_batch)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank39]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank39]:     return forward_call(*args, **kwargs)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank39]:     sharded_logits = self.model(
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank39]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank39]:     return forward_call(*args, **kwargs)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank39]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank39]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank39]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank39]:     return forward_call(*args, **kwargs)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank39]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank39]:     pipeline_state.run_communication()
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank39]:     recv_activation_tensor = recv_activation()
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank39]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank39]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank39]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default7]:[rank39]:     dist.recv(
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank39]:     return func(*args, **kwargs)
-[default7]:[rank39]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank39]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank39]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer
-[default7]:[rank39]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank25]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank26]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank43]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer
-[default0]:[rank48]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer
-[default3]:[rank59]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer
-[default3]:[rank59]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default4]:[rank60]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer
-[default7]:[rank39]: frame #21: + 0x1445a6 (0x560e75b1b5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #22: _PyObject_MakeTpCall + 0x26b (0x560e75b14a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #23: + 0x150866 (0x560e75b27866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #20: + 0x47def4 (0x7fc0a87d2ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #21: + 0x1445a6 (0x55e94db8b5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #62: + 0x150582 (0x55c5fc936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x55c5fc936f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank27]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x560e75b10142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #25: _PyFunction_Vectorcall + 0x6c (0x560e75b1ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55e94db84a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #23: + 0x150866 (0x55e94db97866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55e94db80142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fa10fdd8c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank39]: frame #26: PyObject_Call + 0xbc (0x560e75b27f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x560e75b0e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #28: _PyFunction_Vectorcall + 0x6c (0x560e75b1ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55e94db8ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #26: PyObject_Call + 0xbc (0x55e94db97f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x560e75b0c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank39]: frame #30: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x560e75b0c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55e94db7e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55e94db8ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55e94db7c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #30: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55e94db7c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: dist.recv( -[default2]:[rank26]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c0e80e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: frame #32: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x560e75b0c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #34: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x560e75b0c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x560e75b13f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #37: _PyObject_Call_Prepend + 0x69 (0x560e75b25c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #38: + 0x211239 (0x560e75be8239 in /fsx/ferdinandmom/miniforge3/envs/e[default3]:[rank43]: frame #32: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55e94db7c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7c7b485fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -nv-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #39: _PyObject_MakeTpCall + 0x26b (0x560e75b14a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x560e75b103e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #34: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55e94db7c8fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55e94db83f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer -[default7]:[rank55]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank39]: frame #41: _PyFunction_Vectorcall + 0x6c (0x560e75b1ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x560e75b0bc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #43: _PyFunction_Vectorcall + 0x6c (0x560e75b1ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55e94db95c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #38: + 0x211239 (0x55e94dc58239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55e94db84a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55e94db803e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #1: + 0x5b3a23e (0x7f3c4832b23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x560e75b0c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #45: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #46: PyObject_Call + 0xbc (0x560e75b27f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x560e75b0e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55e94db8ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55e94db7bc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55e94db8ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: frame #48: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #49: PyObject_Call + 0xbc (0x560e75b27f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x560e75b0e2b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55e94db7c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #45: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #46: PyObject_Call + 0xbc (0x55e94db97f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55e94db7e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #48: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #49: PyObject_Call + 0xbc (0x55e94db97f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55e94db7e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin[default2]:[rank26]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f3c48325c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank26]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f3c48325f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #51: _PyFunction_Vectorcall + 0x6c (0x560e75b1ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x560e75b14007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -/python3.10) -[default3]:[rank43]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55e94db8ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: frame #53: _PyObject_Call_Prepend + 0x69 (0x560e75b25c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #54: + 0x211239 (0x560e75be8239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #55: PyObject_Call + 0x207 (0x560e75b28067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x560e75b0e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #57: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x560e75b0c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #59: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-clust[default3]:[rank43]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55e94db84007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55e94db95c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #54: + 0x211239 (0x55e94dc58239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default3]:[rank43]: frame #55: PyObject_Call + 0x207 (0x55e94db98067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55e94db7e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank48]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7c7b43a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7c7b43a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -er/bin/python3.10) -[default3]:[rank43]: frame #57: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55e94db7c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #59: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: return func(*args, **kwargs) -[default2]:[rank26]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f3c48326fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7c7b43a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8c6f5ff897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank39]: frame #60: PyObject_Call + 0xbc (0x560e75b27f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x560e75b0e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #62: + 0x150582 (0x560e75b27582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #63: PyObject_Call + 0xbc (0x560e75b27f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #60: PyObject_Call + 0xbc (0x55e94db97f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55e94db7e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #62: + 0x150582 (0x55e94db97582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fa10fdd8f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #1: + 0x5b3a23e (0x7f8ca911c23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default3]:[rank43]: frame #63: PyObject_Call + 0xbc (0x55e94db97f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f8ca9116c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default3]:[rank27]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank28]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fa10fdd9fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank25]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa10fd8e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f8ca9116f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default0]:[rank24]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f8ca9117fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( 
-[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.p[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8ca90cc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -y", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f132368f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank26]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3c482db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7c7b43a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from[default1]:[rank25]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa10fd8e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f7c42c47189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() 
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag[default3]:[rank27]: frame #1: + 0x5b3a23e (0x7f135d1ac23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f7c42c4e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f7c42c6d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default4]:[rank28]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: frame #12: + 0x5adc309 (0x7f7c7b42c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default6]:[rank46]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank25]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa10fd8e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank26]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3c482db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8ca90cc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8ca90cc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #8: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8ca90cc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f967ea5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: frame #1: + 0x5b3a23e (0x7f96b857923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f135d1a6c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank48]: frame #13: + 0x5ae6f10 (0x7f7c7b436f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f96b8573c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank24]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank55]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f8c708d9189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f96b8573f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f96b8574fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f96b8529371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f96b8529371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f96b8529371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libto[default1]:[rank25]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa10fd8e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f8c708e0610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f8c708ff978 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -rch_cpu.so) -[default0]:[rank24]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank48]: frame #14: + 0x5ae6fa5 (0x7f7c7b436fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #15: + 0x5124446 (0x7f7c7aa74446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f96b8529371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: dist.recv( -[default7]:[rank55]: frame #12: + 0x5adc309 (0x7f8ca90be309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f967fd36189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f967fd3d610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f967fd5c978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #12: + 0x5adc309 (0x7f96b851b309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f135d1a6f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #13: + 0x5ae6f10 (0x7f8ca90c8f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #13: + 0x5ae6f10 (0x7f96b8525f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank26]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3c482db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank26]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3c482db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #16: + 0x1acf4b8 (0x7f7c7741f4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #14: + 0x5ae6fa5 (0x7f96b8525fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #15: + 0x5124446 (0x7f96b7b63446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #16: + 0x1acf4b8 (0x7f96b450e4b8 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank25]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fa0d759b189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #17: + 0x5aee004 (0x7f7c7b43e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #17: + 0x5aee004 (0x7f96b852d004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #18: + 0x5af36b5 (0x7f96b85326b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: frame #14: + 0x5ae6fa5 (0x7f8ca90c8fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #15: + 0x5124446 (0x7f8ca8706446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #19: + 0xd2631e (0x7f96cb11c31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #20: + 0x47def4 (0x7f96ca873ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #21: + 0x1445a6 (0x563a1f34a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563a1f343a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #23: + 0x150866 (0x563a1f356866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f343d838897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank26]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f3c0fae8189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank27]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f135d1a7fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #16: + 0x1acf4b8 (0x7f8ca50b14b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563a1f33f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563a1f34aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #26: PyObject_Call + 0xbc (0x563a1f356f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: return func(*args, **kwargs) -[default0]:[rank48]: frame #18: + 0x5af36b5 
(0x7f7c7b4436b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #19: + 0xd2631e (0x7f7c8e02d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default6]:[rank46]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563a1f33d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563a1f34aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563a1f33b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #30: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fa0d75a2610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #20: + 0x47def4 (0x7f7c8d784ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563a1f33b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #32: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563a1f33b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: frame #1: + 0x5b3a23e (0x7f347735523e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #17: + 0x5aee004 (0x7f8ca90d0004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #34: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563a1f33b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563a1f342f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default6]:[rank46]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563a1f354c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #38: + 0x211239 (0x563a1f417239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f3c0faef610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #18: + 0x5af36b5 (0x7f8ca90d56b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #19: + 0xd2631e (0x7f8cbbcbf31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank55]: frame #20: + 0x47def4 (0x7f8cbb416ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563a1f343a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563a1f33f3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563a1f34aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fa0d75c1978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #21: + 0x1445a6 (0x55ca21d765a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55ca21d6fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default[default6]:[rank46]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563a1f33ac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563a1f34aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563a1f33b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank24]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f347734fc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #23: + 0x150866 (0x55ca21d82866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55ca21d6b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/p[default6]:[rank46]: frame #45: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #46: PyObject_Call + 0xbc (0x563a1f356f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563a1f33d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #48: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #49: PyObject_Call + 0xbc (0x563a1f356f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563a1f33d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563a1f34aa2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/pyt[default2]:[rank26]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f3c0fb0e978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55ca21d76a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #21: + 0x1445a6 (0x55a1511425a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -arallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -hon3.10) -[default6]:[rank46]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563a1f343007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563a1f354c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #54: + 0x211239 (0x563a1f417239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #55: PyObject_Call + 0x207 (0x563a1f357067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563a1f33d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #57: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default3]:[rank27]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f135d15c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #26: PyObject_Call + 0xbc (0x55ca21d82f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38[default6]:[rank46]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563a1f33b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #59: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #60: PyObject_Call + 0xbc (0x563a1f356f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563a1f33d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #62: + 0x150582 (0x563a1f356582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f347734ff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55a15113ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #23: + 0x150866 (0x55a15114e866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default6]:[rank38]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank38]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2264a9d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank38]: frame #1: + 0x5b3a23e (0x7f229e5ba23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f229e5b4c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38[default6]:[rank46]: frame #63: PyObject_Call + 0xbc (0x563a1f356f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank25]: frame #12: + 0x5adc309 (0x7fa10fd80309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55a151137142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f229e5b4f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f229e5b5fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f229e56a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank26]: frame #12: + 0x5adc309 (0x7f3c482cd309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55a151142a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55ca21d692b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f229e56a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f229e56a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f229e56a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2265d77189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f22[default2]:[rank26]: frame #13: + 0x5ae6f10 (0x7f3c482d7f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55ca21d76a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55ca21d678fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -65d7e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2265d9d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank24]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f3477350fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: 
frame #30: + 0x150582 (0x55ca21d82582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55ca21d678fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #12: + 0x5adc309 (0x7f229e55c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #13: + 0x5ae6f10 (0x7f229e566f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank28]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank48]: frame #32: + 0x150582 (0x55ca21d82582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55ca21d678fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #14: + 0x5ae6fa5 (0x7f229e566fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default0]:[rank24]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3477305371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #34: + 0x150582 (0x55ca21d82582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55ca21d678fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #15: + 0x5124446 (0x7f229dba4446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #16: + 0x1acf4b8 (0x7f229a54f4b8 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -n/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward[default3]:[rank27]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f135d15c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f135d15c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b3a915897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank48]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55ca21d6ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #17: + 0x5aee004 (0x7f229e56e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #18: + 0x5af36b5 (0x7f229e5736b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #19: + 0xd2631e (0x7f22b115d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) - -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank25]: frame #13: + 0x5ae6f10 (0x7fa10fd8af10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55ca21d80c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #38: + 0x211239 (0x55ca21e43239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55ca21d6fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #20: + 0x47def4 (0x7f22b08b4ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank38]: frame #21: + 0x1445a6 (0x5595e45cb5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5595e45c4a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57[default3]:[rank27]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f135d15c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #26: PyObject_Call + 0xbc (0x55a15114ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #23: + 0x150866 (0x5595e45d7866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5595e45c0142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5595e45cba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default4]:[rank28]: frame #1: + 0x5b3a23e (0x7f5b7443223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55a1511352b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55a151142a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55a1511338fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #26: PyObject_Call + 0xbc (0x5595e45d7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5595e45be2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], 
group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer -[default2]:[rank26]: frame #14: + 0x5ae6fa5 (0x7f3c482d7fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #30: + 0x150582 (0x55a15114e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55ca21d6b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55ca21d76a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5595e45cba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5595e45bc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #30: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5595e45bc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #32: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5595e45bc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #34: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank57]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0d1660a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank57]: frame #1: + 0x5b3a23e (0x7f0d5012723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f1324969189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank27]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f1324970610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55a1511338fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #32: + 0x150582 (0x55a15114e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55a1511338fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #34: + 0x150582 (0x55a15114e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55a1511338fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55a15113af50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) --bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5595e45bc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5595e45c3f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5595e45d5c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #38: + 0x211239 (0x5595e4698239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5595e45c4a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f0d50121c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f0d50121f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f132498f978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank26]: frame #15: + 0x5124446 (0x7f3c47915446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank25]: frame #14: + 0x5ae6fa5 (0x7fa10fd8afa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55a15114cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55ca21d66c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55ca21d76a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55ca21d678fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5595e45c03e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5595e45cba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5595e45bbc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5595e45cba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5595e45bc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #45: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #4: 
c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f0d50122fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0d500d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0d500d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank25]: frame #15: + 0x5124446 (0x7fa10f3c8446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #38: + 0x211239 (0x55a15120f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55a15113ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #46: PyObject_Call + 0xbc (0x5595e45d7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5595e45be2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #48: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0d500d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0d500d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f0d178e4189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank28]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5b7442cc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #45: + 0x150582 (0x55ca21d82582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #49: PyObject_Call + 0xbc (0x5595e45d7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5595e45be2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5595e45cba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5595e45c4007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: frame #12: + 0x5adc309 (0x7f135d14e309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55a1511373e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5595e45d5c39 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #54: + 0x211239 (0x5595e4698239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #55: PyObject_Call + 0x207 (0x5595e45d8067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5595e45be2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #16: + 0x1acf4b8 (0x7fa10bd734b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #46: PyObject_Call + 0xbc (0x55ca21d82f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55ca21d692b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #57: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5595e45bc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #59: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5b7442cf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55a151142a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #60: PyObject_Call + 0xbc (0x5595e45d7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5595e45be2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: frame #13: + 0x5ae6f10 (0x7f135d158f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #48: + 0x150582 (0x55ca21d82582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #62: + 0x150582 (0x5595e45d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #63: PyObject_Call + 0xbc (0x5595e45d7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank57]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f0d178eb610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f0d1790a978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #12: + 0x5adc309 (0x7f0d500c9309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #13: + 0x5ae6f10 (0x7f0d500d3f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank26]: frame #16: + 0x1acf4b8 (0x7f3c442c04b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55a151132c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default1]:[rank57]: frame #14: + 0x5ae6fa5 (0x7f0d500d3fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #15: + 0x5124446 (0x7f0d4f711446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #16: + 0x1acf4b8 (0x7f0d4c0bc4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #17: + 0x5aee004 (0x7f0d500db004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #18: + 0x5af36b5 (0x7f0d500e06b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5b7442dfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55a151142a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: frame #19: + 0xd2631e (0x7f0d62cca31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #20: + 0x47def4 (0x7f0d62421ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #21: + 0x1445a6 (0x561a98b4e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #17: + 0x5aee004 (0x7fa10fd92004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank24]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3477305371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank24]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3477305371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55a1511338fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #45: + 0x150582 (0x55a15114e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-[default1]:[rank57]: frame #22: _PyObject_MakeTpCall + 0x26b (0x561a98b47a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #23: + 0x150866 (0x561a98b5a866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 
(0x561a98b43142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #25: _PyFunction_Vectorcall + 0x6c (0x561a98b4ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5b743e2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #46: PyObject_Call + 0xbc (0x55a15114ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[defaul[default1]:[rank57]: frame #26: PyObject_Call + 0xbc (0x561a98b5af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x561a98b412b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #28: _PyFunction_Vectorcall + 0x6c (0x561a98b4ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #17: + 0x5aee004 (0x7f3c482df004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55a1511352b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #49: PyObject_Call + 0xbc (0x55ca21d82f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -t1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank57]: frame #29: 
_PyEval_EvalFrameDefault + 0x13ca (0x561a98b3f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #30: + 0x150582 (0x561a98b5a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x561a98b3f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #18: + 0x5af36b5 (0x7f3c482e46b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #48: + 0x150582 (0x55a15114e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default1]:[rank33]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank33]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe5c2de3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank33]: frame #1: + 0x5b3a23e (0x7fe5fc90023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #2:[default1]:[rank57]: frame #32: + 0x150582 (0x561a98b5a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x561a98b3f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #34: + 0x150582 (0x561a98b5a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x561a98b3f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #18: + 0x5af36b5 (0x7fa10fd976b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #49: PyObject_Call + 0xbc (0x55a15114ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) - c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fe5fc8fac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fe5fc8faf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fe5fc8fbfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe5fc8b0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe5fc8b0371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.[default1]:[rank57]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x561a98b46f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #37: _PyObject_Call_Prepend + 0x69 (0x561a98b58c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #38: + 0x211239 (0x561a98c1b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5b743e2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55ca21d692b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe5fc8b0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe5fc8b0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #39: _PyObject_MakeTpCall + 0x26b (0x561a98b47a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x561a98b433e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #41: _PyFunction_Vectorcall + 0x6c (0x561a98b4ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x561a98b3ec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3477305371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: frame #14: + 0x5ae6fa5 (0x7f135d158fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55a1511352b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55a151142a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55a15113b007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fe5c40bd189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fe5c40c4610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fe5c40e3978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #12: 
+ 0x5adc309 (0x7fe5fc8a2309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #13: [default1]:[rank57]: frame #43: _PyFunction_Vectorcall + 0x6c (0x561a98b4ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x561a98b3f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #45: + 0x150582 (0x561a98b5a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #46: PyObject_Call + 0xbc (0x561a98b5af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #19: + 0xd2631e (0x7fa12298131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55ca21d76a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -+ 0x5ae6f10 (0x7fe5fc8acf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x561a98b412b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #48: + 0x150582 (0x561a98b5a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #49: PyObject_Call + 0xbc (0x561a98b5af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f343eb12189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55ca21d6f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #14: + 0x5ae6fa5 (0x7fe5fc8acfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #15: + 0x5124446 (0x7fe5fbeea446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x561a98b412b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #51: _PyFunction_Vectorcall + 0x6c (0x561a98b4ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x561a98b47007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #53: _PyObject_Call_Prepend + 0x69 (0x561a98b58c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: frame #15: + 0x5124446 (0x7f135c796446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55ca21d80c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #16: + 0x1acf4b8 (0x7fe5f88954b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #17: + 0x5aee004 
(0x7fe5fc8b4004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[rank24]/[rank25]/[rank26]/[rank27]/[rank28]/[rank33]/[rank48]/[rank55]/[rank57]: [interleaved C++/CPython backtrace frames: c10d::TCPStore/PrefixStore::get -> c10d::ProcessGroupNCCL::broadcastUniqueNCCLID -> c10d::ProcessGroupNCCL::getNCCLComm -> c10d::ProcessGroupNCCL::recv (libtorch_cuda.so / libtorch_cpu.so), then libtorch_python.so and python3.10 interpreter frames up to frame #63]
-[default1]:[rank57]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default1]:[rank33]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default0]:[rank48]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default7]:[rank55]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default2]:[rank50]: Traceback (most recent call last):
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank50]: trainer.train(dataloader)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank50]: output = model(**micro_batch)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank50]: return self._call_impl(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank50]: return forward_call(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank50]: sharded_logits = self.model(
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank50]: return self._call_impl(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank50]: return forward_call(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank50]: return self._call_impl(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank50]: return forward_call(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default2]:[rank50]: pipeline_state.run_communication()
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default2]:[rank50]: recv_activation_tensor = recv_activation()
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default2]:[rank50]: dist.recv(
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank50]: return func(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank50]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer
-[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default2]:[rank50]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb6296c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:[rank50]: [frames #1-#63: c10d::TCPStore::doWait/doGet/get -> c10d::PrefixStore::get -> c10d::ProcessGroupNCCL::broadcastUniqueNCCLID -> getNCCLComm -> recv (libtorch_cpu.so / libtorch_cuda.so), then libtorch_python.so and python3.10 interpreter frames]
-[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default5]:[rank61] / [default7]:[rank63]: [same Python traceback as rank50: run_train.py:237 -> trainer.py:429 train -> trainer.py:462 training_step -> engine.py:278 train_batch_iter -> engine.py:44 forward -> llama.py:891 forward -> llama.py:764/780 forward_with_hidden_states -> block.py:126 forward -> functional.py:117 recv_from_pipeline_state_buffer -> state.py:150/31 run_communication -> p2p.py:353 recv_tensors -> p2p.py:326 irecv_tensors -> p2p.py:246 _recv_meta -> dist.recv -> pg.recv([tensor], group_src_rank, tag).wait()]
-[default7]:[rank47]: [same Python traceback as rank50, except engine.py line 252 in train_batch_iter]
-[default7]:[rank47]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer
-[default7]:[rank47]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank17]: [same Python traceback as rank50, except engine.py line 252 in train_batch_iter]
-[default1]:[rank17]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank17]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank17]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa688a63897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:[rank17]: [frames #1-#48: c10d::TCPStore::doWait/doGet/get -> c10d::PrefixStore::get -> c10d::ProcessGroupNCCL::broadcastUniqueNCCLID -> getNCCLComm -> recv (libtorch_cpu.so / libtorch_cuda.so), then libtorch_python.so and python3.10 interpreter frames]
-[default1]:[rank17]: frame #49: PyObject_Call + 0xbc (0x5638e6d53f1c in
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5638e6d3a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5638e6d47a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5638e6d40007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55840b3c3f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef515db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: frame #1: + 0x5b3a23e (0x7fef8b0f823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fef8b0f2c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fef8b0f2f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fef8b0f3fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/si[default5]:[rank61]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer -[default5]:[rank61]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank17]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5638e6d51c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #54: + 0x211239 (0x5638e6e14239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #55: PyObject_Call + 0x207 (0x5638e6d54067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5638e6d3a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5585de837c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -te-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fef8b0a8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fef8b0a8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fef8b0a8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fef8b0a8371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fef528b5189 in /fsx[default5]:[rank61]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35695d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank17]: frame #57: + 0x150582 (0x5638e6d53582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5638e6d388fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #59: + 0x150582 (0x5638e6d53582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55c5f24dda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fef528bc610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fef528db978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #12: + 0x5adc309 (0x7fef8b09a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #13: + 0x5ae6f10 (0x7fef8b0a4f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: frame #60: PyObject_Call + 0xbc (0x5638e6d53f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5638e6d3a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: frame #62: + 0x150582 (0x5638e6d53582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #23: + 0x150866 (0x55c5f24f0866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55840b3d5c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #38: + 0x211239 (0x55840b498239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #14: + 0x5ae6fa5 (0x7fef8b0a4fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #1: + 0x5b3a23e (0x7f35a30f123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank17]: frame #63: PyObject_Call + 0xbc (0x5638e6d53f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank17]: . 
This may indicate a possible application crash on rank 0 or a network set up issue. -[default0]:[rank24]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5629c9fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #15: + 0x5124446 (0x7fef8a6e2446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #16: + 0x1acf4b8 (0x7fef8708d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #17: + 0x5aee004 (0x7fef8b0ac004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #18: + 0x5af36b5 (0x7fef8b0b16b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #19: + 0xd2631e (0x7fef9dc9b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default2]:[rank26]: frame #38: + 0x211239 (0x5585de8fa239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #20: + 0x47def4 (0x7fef9d3f2ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #21: + 0x1445a6 (0x56469054d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #22: _PyObject_MakeTpCall + 0x26b (0x564690546a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank61]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f35a30ebc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: output = model(**micro_batch) -[default3]:[rank27]: frame #38: + 0x211239 (0x55c39b628239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55c5f24d9142 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #23: + 0x150866 (0x564690559866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x564690542142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56469054da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: dist.recv( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank20]: sharded_logits = self.model( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.p[default0]:[rank24]: frame #34: + 0x150582 (0x5629c9fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #26: PyObject_Call + 0xbc (0x564690559f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5646905402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -y", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55840b3c4a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5585de826a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56469054da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank47]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56469053e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #30: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56469053e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank20]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank24]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5629c9fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #32: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56469053e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #34: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank20]: recv_activation_tensor = recv_activation() -[default4]:[rank28]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55c5f24e4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5585de8223e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56469053e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x564690545f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f35a30ebf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank20]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank20]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank27]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55c39b554a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default3]:[rank27]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55c39b5503e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #37: _PyObject_Call_Prepend + 0x69 (0x564690557c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #38: + 0x211239 (0x56469061a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #39: _PyObject_MakeTpCall + 0x26b (0x564690546a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5646905423e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56469054da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f35a30ecfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank20]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank25]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55840b3c03e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56469053dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56469054da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56469053e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #45: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #46: PyObject_Call + 0xbc (0x564690559f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5646905402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #48: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-clu[default7]:[rank63]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank20]: dist.recv( -[default0]:[rank24]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5629c9facf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5585de82da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ster/bin/python3.10) -[default7]:[rank47]: frame #49: PyObject_Call + 0xbc (0x564690559f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd326219897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank61]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f35a30a1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank28]: frame #26: PyObject_Call + 0xbc (0x55c5f24f0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5646905402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56469054da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x564690546007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #53: _PyObject_Call_Prepend + 0x69 (0x564690557c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f35a30a1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f35a30a1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f35a30a1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #1: + 0x5b3a23e (0x7fd35fd3623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank20]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank24]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5629c9fbec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank24]: frame #38: + 0x211239 (0x5629ca081239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #54: + 0x211239 (0x56469061a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #55: PyObject_Call + 0x207 (0x56469055a067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5646905402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #57: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56469053e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #59: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #2: 
c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd35fd30c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd35fd30f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd35fd31fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank28]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55c5f24d72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #60: PyObject_Call + 0xbc (0x564690559f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5646905402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #62: + 0x150582 (0x564690559582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #63: PyObject_Call + 0xbc (0x564690559f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd35fce6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank27]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55c39b55ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank26]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5585de81dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank63]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd35fce6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd35fce6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd35fce6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05378a7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank20]: frame #1: + 0x5b3a23e (0x7f05713c423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank25]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55840b3cba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f356a8ae189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank20]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f05713bec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f05713bef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank25]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55840b3bbc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd3274f3189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank20]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f05713bffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0571374371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0571374371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55c5f24e4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f356a8b5610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank20]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0571374371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #8: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0571374371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank24]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5629c9fada6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f356a8d4978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd3274fa610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd327519978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #12: + 0x5adc309 (0x7fd35fcd8309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f0538b81189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank20]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f0538b88610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank20]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f0538ba7978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank25]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55840b3cba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = 
self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank63]: frame #13: + 0x5ae6f10 (0x7fd35fce2f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #12: + 0x5adc309 (0x7f0571366309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #13: + 0x5ae6f10 (0x7f0571370f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank28]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55c5f24d58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #12: + 0x5adc309 (0x7f35a3093309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #14: + 0x5ae6fa5 (0x7f0571370fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #15: + 0x5124446 (0x7f05709ae446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #16: + 0x1acf4b8 (0x7f056d3594b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank27]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55c39b54bc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #13: + 0x5ae6f10 (0x7f35a309df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #17: + 0x5aee004 (0x7f0571378004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #18: + 0x5af36b5 (0x7f057137d6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #19: + 0xd2631e (0x7f0583f6731e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank26]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5585de82da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55840b3bc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: frame #14: + 0x5ae6fa5 (0x7fd35fce2fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #20: + 0x47def4 (0x7f05836beef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank20]: frame #21: + 0x1445a6 (0x55c7f33bd5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #30: + 0x150582 (0x55c5f24f0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in 
forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36][default7]:[rank63]: frame #15: + 0x5124446 (0x7fd35f320446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55c7f33b6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank20]: frame #23: + 0x150866 (0x55c7f33c9866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank20]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55c7f33b2142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55c5f24d58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55c39b55ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default7]:[rank63]: frame #16: + 0x1acf4b8 (0x7fd35bccb4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #17: + 0x5aee004 (0x7fd35fcea004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #18: + 0x5af36b5 (0x7fd35fcef6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank20]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55c7f33bda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank20]: frame #26: PyObject_Call + 0xbc (0x55c7f33c9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank25]: frame #45: + 0x150582 (0x55840b3d7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55c39b54c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", 
line 126, in forward
-[default4]:[rank36]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:[rank36]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default4]:[rank36]:     pipeline_state.run_communication()
-[default4]:[rank36]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank36]:     recv_activation_tensor = recv_activation()
-[default4]:[rank36]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank36]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank36]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank36]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank36]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank36]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank36]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default4]:[rank36]:     dist.recv(
-[default4]:[rank36]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank36]:     return func(*args, **kwargs)
-[default4]:[rank36]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank36]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank36]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer
-[default4]:[rank36]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default4]:[rank36]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (libc10.so)
-[default4]:[rank36]: frames #1-#8: c10d::TCPStore::doWait / c10d::TCPStore::doGet / c10d::TCPStore::get and c10d::PrefixStore::get (libtorch_cpu.so)
-[default4]:[rank36]: frames #9-#11: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID / c10d::ProcessGroupNCCL::getNCCLComm / c10d::ProcessGroupNCCL::recv (libtorch_cuda.so)
-[default4]:[rank36]: frames #12-#20: internal offsets in libtorch_cpu.so and libtorch_python.so
-[default4]:[rank36]: frames #21-#63: CPython interpreter frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, _PyObject_MakeTpCall, PyObject_Call, ...)
-[default4]:[rank36]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default5]:[rank53]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer
-[default5]:[rank53]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default5]:[rank53]: [... same Python traceback through the nanotron pipeline_parallel recv path (state.py -> p2p.py _recv_meta -> dist.recv) and the same frame #0-#63 dump as rank 36 ...]
-[default5]:[rank53]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default0]:[rank40]: Traceback (most recent call last):
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank40]:     trainer.train(dataloader)
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank40]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank40]:     outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default0]:[rank40]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank40]:     output = model(**micro_batch)
-[default0]:[rank40]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank40]:     return self._call_impl(*args, **kwargs)
-[default0]:[rank40]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank40]:     return forward_call(*args, **kwargs)
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank40]:     sharded_logits = self.model(
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank40]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank40]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank40]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank40]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank40]: [... continues through the same recv_from_pipeline_state_buffer / p2p recv path and dist.recv call shown above for rank 36 ...]
-[... the same Python traceback, DistBackendError and frame #0-#63 dump are emitted, interleaved, by ranks 18, 20, 21, 22, 24, 25, 26, 27, 28, 31, 32, 42, 44, 61 and 63, each ending with ". This may indicate a possible application crash on rank 0 or a network set up issue." ...]
-[default0]:[rank24]: frame #62: + 0x150582 (0x5629c9fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55c39b54c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: frame #60: PyObject_Call + 0xbc (0x55f34501cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55f3450032b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: frame #59: + 0x150582 (0x55c39b567582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: Traceback (most recent call last): -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: frame #62: + 0x150582 (0x55f34501c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #63: PyObject_Call + 0xbc (0x55f34501cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: Traceback (most recent call last): -[default0]:[rank40]: return func(*args, **kwargs) -[default5]:[rank61]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default6]:[rank22]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: frame #63: PyObject_Call + 0xbc (0x5585de839f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank26]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank37]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank22]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55c5f24d72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #48: + 0x150582 (0x55c5f24f0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: trainer.train(dataloader) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: frame #60: PyObject_Call + 0xbc (0x55c39b567f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55c39b54e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank27]: frame #62: + 0x150582 (0x55c39b567582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: frame #63: PyObject_Call + 0xbc (0x5629c9fc0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank24]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: sharded_logits = self.model( -[default0]:[rank40]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank27]: frame #63: PyObject_Call + 0xbc (0x55c39b567f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe9ebcfa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank21]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank28]: frame #49: PyObject_Call + 0xbc (0x55c5f24f0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55c5f24d72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #1: + 0x5b3a23e (0x7fea2581723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank28]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55c5f24e4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default4]:[rank28]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55c5f24dd007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55c5f24eec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #54: + 0x211239 (0x55c5f25b1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fea25811c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank21]: pipeline_state.run_communication() -[default4]:[rank28]: frame #55: PyObject_Call + 0x207 (0x55c5f24f1067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55c5f24d72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank28]: frame #57: + 0x150582 (0x55c5f24f0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55c5f24d58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fea25811f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank21]: recv_activation_tensor = recv_activation() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank18]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank28]: frame #59: + 0x150582 (0x55c5f24f0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fea25812fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fea257c7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fea257c7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank28]: frame #60: PyObject_Call + 0xbc (0x55c5f24f0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55c5f24d72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank28]: frame #62: + 0x150582 (0x55c5f24f0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fea257c7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default4]:[rank28]: frame #63: PyObject_Call + 0xbc (0x55c5f24f0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from 
[0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer -[default5]:[rank21]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank18]: pipeline_state.run_communication() -[default4]:[rank28]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f46753a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank31]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fea257c7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fe9ecfd4189 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fe9ecfdb610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fe9ecffa978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #1: + 0x5b3a23e (0x7f46aeebf23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank18]: recv_activation_tensor = recv_activation() -[default7]:[rank31]: recv_activation_tensor = recv_activation() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank58]: output = model(**micro_batch) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in 
_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank16]: Traceback (most recent call last): -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank31]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank40]: frame #12: + 0x5adc309 (0x7fea257b9309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank31]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fde7cf29897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank31]: frame #1: + 0x5b3a23e (0x7fdeb6a4623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank31]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fdeb6a40c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fdeb6a40f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: frame #13: + 0x5ae6f10 (0x7fea257c3f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f46aeeb9c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f46aeeb9f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank31]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fdeb6a41fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdeb69f6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdeb69f6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank31]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdeb69f6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdeb69f6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fde7e203189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank31]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fde7e20a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default2]:[rank18]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank22]: pipeline_state.run_communication() -[default7]:[rank31]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fde7e229978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank31]: frame #12: + 0x5adc309 (0x7fdeb69e8309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #13: + 0x5ae6f10 (0x7fdeb69f2f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank40]: frame #14: + 0x5ae6fa5 (0x7fea257c3fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default5]:[rank21]: dist.recv( -[default7]:[rank31]: frame #14: + 0x5ae6fa5 (0x7fdeb69f2fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #15: + 0x5124446 (0x7fdeb6030446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #16: + 0x1acf4b8 (0x7fdeb29db4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: frame #15: + 0x5124446 (0x7fea24e01446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank31]: frame #17: + 0x5aee004 (0x7fdeb69fa004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #18: + 0x5af36b5 (0x7fdeb69ff6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank31]: frame #19: + 0xd2631e (0x7fdec95e931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: frame #16: + 0x1acf4b8 (0x7fea217ac4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: frame #20: + 0x47def4 (0x7fdec8d40ef4 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: frame #17: + 0x5aee004 (0x7fea257cb004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: recv_activation_tensor = recv_activation() -[default7]:[rank31]: frame #21: + 0x1445a6 (0x55d125a265a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55d125a1fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: return func(*args, **kwargs) -[default7]:[rank31]: frame #23: + 0x150866 (0x55d125a32866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55d125a1b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55d125a26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank16]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: frame #26: PyObject_Call + 0xbc (0x55d125a32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55d125a192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55d125a26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55d125a178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f46aeebafd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank31]: frame #30: + 0x150582 (0x55d125a32582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55d125a178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #32: + 0x150582 (0x55d125a32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55d125a178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank18]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: frame #34: + 0x150582 (0x55d125a32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55d125a178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55d125a1ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f46aee6f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f46aee6f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank31]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55d125a30c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #38: + 0x211239 (0x55d125af3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank31]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55d125a1fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer 
-[ranks 16-62 emit interleaved copies of the following traceback while setting up pipeline-parallel NCCL receives]
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-    trainer.train(dataloader)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-    outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-    outputs = self.pipeline_engine.train_batch_iter(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-    output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-    output = model(**micro_batch)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-    return self._call_impl(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-    return forward_call(*args, **kwargs)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-    sharded_logits = self.model(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-    return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-    hidden_encoder_states = encoder_block(**hidden_encoder_states)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-    new_kwargs[name] = recv_from_pipeline_state_buffer(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-    pipeline_state.run_communication()
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-    recv_activation_tensor = recv_activation()
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-    return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-    buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-    meta = self._recv_meta(from_rank=from_rank, tag=tag)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-    dist.recv(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-    return func(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-    pg.recv([tensor], group_src_rank, tag).wait()
-torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-(other ranks raise the same error as [2] with key '1:2' and as [3] with key '2:3')
-Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-frame #0: c10::Error::Error(c10::SourceLocation, std::string) (libc10.so)
-frames #2-#4: c10d::TCPStore::doWait / c10d::TCPStore::doGet / c10d::TCPStore::get (libtorch_cpu.so)
-frames #5-#8: c10d::PrefixStore::get(std::string const&) (libtorch_cpu.so)
-frames #9-#11: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID / c10d::ProcessGroupNCCL::getNCCLComm / c10d::ProcessGroupNCCL::recv (libtorch_cuda.so)
-frames #12-#63: libtorch_cpu.so / libtorch_python.so / CPython interpreter frames (python3.10)
-This may indicate a possible application crash on rank 0 or a network set up issue.
tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: dist.recv( -[default2]:[rank58]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55de0960ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #23: + 0x150866 (0x55de09621866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55de0960a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55de09615a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #26: PyObject_Call + 0xbc (0x55de09621f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f17fc28f610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank29]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55de096082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55de09615a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55de096068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #30: + 0x150582 (0x55de09621582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55de096068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #19: + 0xd2631e (0x7f1e0e6ad31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank54]: dist.recv( -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b773be8f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #32: + 0x150582 (0x55de09621582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55de096068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #34: + 0x150582 (0x55de09621582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55de096068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f60fa03e978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank30]: dist.recv( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7f9d545897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank58]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55de0960df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55de0961fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #38: + 0x211239 (0x55de096e2239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f17fc2ae978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default5]:[rank29]: pipeline_state.run_communication() -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: frame #1: + 0x5b3a23e (0x7f7fd706223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55de0960ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55de0960a3e6 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55de09615a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55de09605c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #20: + 0x47def4 (0x7f1e0de04ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank40]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b773bfac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55de09615a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55de096068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #45: + 0x150582 (0x55de09621582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #46: PyObject_Call + 0xbc (0x55de09621f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #21: + 0x1445a6 (0x55b9930405a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #12: + 0x5adc309 (0x7f1834a6d309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f7fd705cc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55de096082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #48: + 0x150582 (0x55de09621582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #49: PyObject_Call + 0xbc (0x55de09621f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55de096082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #13: + 0x5ae6f10 (0x7f1834a77f10 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55de09615a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55de0960e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55de0961fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b993039a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank30]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7bc2b2a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank51]: pipeline_state.run_communication() -[default6]:[rank54]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: frame #54: + 0x211239 (0x55de096e2239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #55: PyObject_Call + 0x207 (0x55de09622067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55de096082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #23: + 0x150866 (0x55b99304c866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #1: + 0x5b3a23e (0x7f7bfc64723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0e94a51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 
1932, in recv -[default2]:[rank58]: frame #57: + 0x150582 (0x55de09621582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55de096068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #59: + 0x150582 (0x55de09621582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #60: PyObject_Call + 0xbc (0x55de09621f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: Traceback (most recent call last): -[default6]:[rank30]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f7bfc641c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #1: + 0x5b3a23e (0x7f0ece56e23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f0ece568c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank44]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f7fd705cf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55de096082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #62: + 0x150582 (0x55de09621582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #63: PyObject_Call + 0xbc (0x55de09621f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f0ece568f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f0ece569fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: frame #38: + 0x211239 (0x55b773cbd239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default3]:[rank19]: trainer.train(dataloader) -[default6]:[rank30]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f7bfc641f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0ece51e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b773be9a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: frame #14: + 0x5ae6fa5 (0x7f1834a77fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank18]: frame #12: + 0x5adc309 (0x7f61327fd309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0ece51e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0ece51e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0ece51e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7bfc642fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b773be53e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b993035142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] 
-[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank18]: frame #13: + 0x5ae6f10 (0x7f6132807f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7bfc5f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7bfc5f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f0e95d2b189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7fd705dfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default6]:[rank22]: frame #15: + 0x5124446 (0x7f18340b5446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: frame #16: + 0x1acf4b8 (0x7f1830a604b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7bfc5f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7bfc5f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b773bf0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #17: + 0x5aee004 (0x7f1834a7f004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: frame #14: + 0x5ae6fa5 (0x7f6132807fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f7bc3e04189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f0e95d32610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: frame #15: + 0x5124446 (0x7f6131e45446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: frame #18: + 0x5af36b5 (0x7f1834a846b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fd7012371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fd7012371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: frame #16: + 0x1acf4b8 (0x7f612e7f04b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f7bc3e0b610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank30]: frame #11: 
c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f7bc3e2a978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank54]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f0e95d51978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: dist.recv( -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank21]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b993040a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #26: PyObject_Call + 0xbc (0x55b99304cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #12: + 0x5adc309 (0x7f7bfc5e9309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: dist.recv( -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: frame #17: + 0x5aee004 (0x7f613280f004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank18]: frame #18: + 0x5af36b5 (0x7f61328146b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank29]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: frame #12: + 0x5adc309 (0x7f0ece510309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b773be0c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55b9930332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank30]: frame #13: + 0x5ae6f10 (0x7f7bfc5f3f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default4]:[rank44]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fd7012371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank54]: frame #13: + 0x5ae6f10 (0x7f0ece51af10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: frame #19: + 0xd2631e (0x7f184766e31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank29]: dist.recv( -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fd7012371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f7f9e81f189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: frame #19: + 0xd2631e (0x7f61453fe31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank30]: frame #14: + 0x5ae6fa5 (0x7f7bfc5f3fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer -[default2]:[rank34]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank44]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f7f9e826610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank18]: frame #20: + 0x47def4 (0x7f6144b55ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default3]:[rank51]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: dist.recv( -[default0]:[rank40]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b773bf0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default2]:[rank18]: frame #21: + 0x1445a6 (0x5599a56255a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank22]: frame #20: + 0x47def4 (0x7f1846dc5ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank21]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55b993040a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank29]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: frame #14: + 0x5ae6fa5 (0x7f0ece51afa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank29]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank51]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1068fe7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) 
-[default0]:[rank40]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b773be18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #45: + 0x150582 (0x55b773bfc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank29]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76f04cb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank54]: frame #15: + 0x5124446 (0x7f0ecdb58446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f7f9e845978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank21]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b9930318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #15: + 0x5124446 (0x7f7bfbc31446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #16: + 0x1acf4b8 (0x7f0eca5034b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #1: + 0x5b3a23e (0x7f10a2b0423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: pipeline_state.run_communication() -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5599a561ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #16: + 0x1acf4b8 (0x7f7bf85dc4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f10a2afec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: frame #17: + 0x5aee004 (0x7f7bfc5fb004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: frame #18: + 
0x5af36b5 (0x7f7bfc6006b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f10a2afef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #17: + 0x5aee004 (0x7f0ece522004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: frame #46: PyObject_Call + 0xbc (0x55b773bfcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: output = model(**micro_batch) -[default6]:[rank30]: frame #19: + 0xd2631e (0x7f7c0f1ea31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f10a2afffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default0]:[rank40]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b773be32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #30: + 0x150582 (0x55b99304c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #1: + 0x5b3a23e (0x7f7729fe823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #18: + 0x5af36b5 (0x7f0ece5276b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: frame #48: + 0x150582 (0x55b773bfc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #23: + 0x150866 (0x5599a5631866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #20: + 0x47def4 (0x7f7c0e941ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f10a2ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank44]: frame #12: + 0x5adc309 (0x7f7fd7004309 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: frame #21: + 0x1445a6 (0x561fd02b45a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #22: _PyObject_MakeTpCall + 0x26b (0x561fd02ada6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #19: + 0xd2631e (0x7f0ee111131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f7729fe2c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #20: + 0x47def4 (0x7f0ee0868ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6398f37897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank18]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5599a561a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5599a5625a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #23: + 0x150866 (0x561fd02c0866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #21: + 0x1445a6 (0x563b114f15a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563b114eaa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: frame #49: PyObject_Call + 0xbc (0x55b773bfcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b773be32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #21: + 0x1445a6 (0x5651896a65a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f7729fe2f82 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #23: + 0x150866 (0x563b114fd866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default0]:[rank40]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b773bf0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: frame #1: + 0x5b3a23e (0x7f63d2a5423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x561fd02a9142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #25: _PyFunction_Vectorcall + 0x6c (0x561fd02b4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f10a2ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0c87363897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank44]: frame #13: + 0x5ae6f10 (0x7f7fd700ef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank21]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b9930318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #26: PyObject_Call + 0xbc (0x561fd02c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f10a2ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563b114e6142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank18]: frame #26: PyObject_Call + 0xbc (0x5599a5631f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5599a56182b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x561fd02a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #28: _PyFunction_Vectorcall + 0x6c 
(0x561fd02b4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x561fd02a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f10a2ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc3ca0c6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f63d2a4ec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default6]:[rank30]: frame #30: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x561fd02a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f106a2c1189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #1: + 0x5b3a23e (0x7f0cc0e8023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: frame #32: + 0x150582 (0x55b99304c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #32: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x561fd02a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f106a2c8610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b773be9007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f63d2a4ef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank21]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b9930318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #34: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563b114f1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #1: + 0x5b3a23e (0x7fc403be323e in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f63d2a4ffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b773bfac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5599a5625a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5599a56168fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x561fd02a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x561fd02acf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #26: PyObject_Call + 0xbc (0x563b114fdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f0cc0e7ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank29]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7729fe3fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank29]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7729f98371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563b114e42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f106a2e7978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f0cc0e7af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0515450897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank40]: frame #54: + 0x211239 (0x55b773cbd239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: frame #6: c10d::PrefixStore::get(std::string 
const&) + 0x31 (0x7f7729f98371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #12: + 0x5adc309 (0x7f10a2aa6309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #13: + 0x5ae6f10 (0x7f10a2ab0f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #1: + 0x5b3a23e (0x7f054ef6d23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: frame #55: PyObject_Call + 0x207 (0x55b773bfd067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: output = model(**micro_batch) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: frame #37: _PyObject_Call_Prepend + 0x69 (0x561fd02bec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563b114f1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fc403bddc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank45]: dist.recv( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56518969fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #30: + 0x150582 (0x5599a5631582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7729f98371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank29]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7729f98371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563b114e28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f0cc0e7bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default0]:[rank40]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b773be32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #34: + 0x150582 (0x55b99304c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b9930318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #9: 
c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f76f17a5189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: frame #14: + 0x5ae6fa5 (0x7f10a2ab0fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0cc0e30371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f63d2a04371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank21]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b993038f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f76f17ac610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank29]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f76f17cb978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank54]: frame #30: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fc403bddf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f63d2a04371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank30]: frame #38: + 0x211239 (0x561fd0381239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563b114e28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #32: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default0]:[rank32]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0cc0e30371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f63d2a04371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank18]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5599a56168fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #12: + 0x5adc309 (0x7f7729f8a309 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563b114e28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #34: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fc403bdefd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc403b93371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank18]: frame #32: + 0x150582 (0x5599a5631582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b99304ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #13: + 0x5ae6f10 (0x7f7729f94f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank30]: frame #39: _PyObject_MakeTpCall + 0x26b (0x561fd02ada6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #15: + 0x5124446 (0x7f10a20ee446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0cc0e30371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #14: + 0x5ae6fa5 (0x7f7fd700efa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #15: + 0x5124446 (0x7f7fd664c446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: pipeline_state.run_communication() -[default6]:[rank30]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x561fd02a93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #41: _PyFunction_Vectorcall + 0x6c (0x561fd02b4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563b114e28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f63d2a04371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #16: + 0x1acf4b8 (0x7f7fd2ff74b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank19]: return forward_call(*args, **kwargs) 
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x561fd02a4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563b114e9f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0cc0e30371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f0c8863d189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: return func(*args, **kwargs) -[default0]:[rank40]: frame #57: + 0x150582 (0x55b773bfc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b773be18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #38: + 0x211239 (0x55b99310d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #43: _PyFunction_Vectorcall + 0x6c (0x561fd02b4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563b114fbc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #38: + 0x211239 (0x563b115be239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563b114eaa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc403b93371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank18]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5599a56168fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x561fd02a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #45: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563b114e63e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa35af10897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank18]: frame #34: + 0x150582 (0x5599a5631582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #46: PyObject_Call + 0xbc (0x561fd02c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #14: + 0x5ae6fa5 
(0x7f7729f94fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563b114f1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563b114e1c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563b114f1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f0c88644610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: frame #59: + 0x150582 (0x55b773bfc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank30]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x561fd02a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #16: + 0x1acf4b8 (0x7f109ea994b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f0c88663978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #17: + 0x5aee004 (0x7f7fd7016004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #18: + 0x5af36b5 (0x7f7fd701b6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: frame #23: + 0x150866 (0x5651896b2866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #48: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #49: PyObject_Call + 0xbc (0x561fd02c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563b114e28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #45: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f054ef67c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: pipeline_state.run_communication() -[default6]:[rank22]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56518969b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x561fd02a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #46: PyObject_Call + 0xbc (0x563b114fdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563b114e42b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #48: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #49: PyObject_Call + 0xbc (0x563b114fdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #1: + 0x5b3a23e (0x7fa394a2d23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fa394a27c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: frame #60: PyObject_Call + 0xbc (0x55b773bfcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: frame #15: + 0x5124446 (0x7f77295d2446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563b114e42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #12: + 0x5adc309 (0x7f0cc0e22309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #13: + 0x5ae6f10 (0x7f0cc0e2cf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b773be32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b993039a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #51: _PyFunction_Vectorcall + 0x6c (0x561fd02b4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x561fd02ad007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563b114f1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563b114ea007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f054ef67f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank21]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b9930353e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #53: _PyObject_Call_Prepend + 0x69 (0x561fd02bec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #53: _PyObject_Call_Prepend + 0x69 
(0x563b114fbc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f054ef68fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc403b93371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #14: + 0x5ae6fa5 (0x7f0cc0e2cfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f639a211189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank21]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b993040a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: frame #54: + 0x211239 (0x561fd0381239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #55: PyObject_Call + 0x207 (0x561fd02c1067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x561fd02a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #17: + 0x5aee004 (0x7f10a2ab8004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #18: + 0x5af36b5 (0x7f10a2abd6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f054ef1d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fa394a27f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #19: + 0xd2631e (0x7f7fe9c0531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #20: + 0x47def4 (0x7f7fe935cef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: frame #16: + 0x1acf4b8 (0x7f7725f7d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank29]: frame #17: + 0x5aee004 (0x7f7729f9c004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #19: + 0xd2631e (0x7f10b56a731e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: frame #20: + 0x47def4 (0x7f10b4dfeef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) 
-[default5]:[rank37]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fa394a28fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #62: + 0x150582 (0x55b773bfc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #63: PyObject_Call + 0xbc (0x55b773bfcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5599a56168fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5599a561df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #57: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x561fd02a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #54: + 0x211239 (0x563b115be239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #15: + 0x5124446 (0x7f0cc046a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank18]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5599a562fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #59: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #21: + 0x1445a6 (0x562a6085d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa3949dd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer -[default0]:[rank40]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default4]:[rank44]: frame #21: + 0x1445a6 (0x5632243625a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56322435ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5651896a6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #26: PyObject_Call + 0xbc (0x5651896b2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #18: + 0x5af36b5 (0x7f7729fa16b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #55: PyObject_Call + 0x207 (0x563b114fe067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f054ef1d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f054ef1d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f639a218610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f639a237978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank21]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b993030c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b993040a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #60: PyObject_Call + 0xbc (0x561fd02c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x561fd02a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #62: + 0x150582 (0x561fd02c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank30]: frame #63: PyObject_Call + 0xbc (0x561fd02c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563b114e42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #22: _PyObject_MakeTpCall + 0x26b (0x562a60856a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f054ef1d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa3949dd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa3949dd371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default6]:[rank30]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default6]:[rank54]: frame #57: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #16: + 0x1acf4b8 (0x7f0cbce154b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #17: + 0x5aee004 (0x7f0cc0e34004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc403b93371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank18]: frame #38: + 0x211239 (0x5599a56f2239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5599a561ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #19: + 0xd2631e (0x7f773cb8b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank29]: frame #20: + 0x47def4 (0x7f773c2e2ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank29]: frame #21: + 0x1445a6 (0x5650605be5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563b114e28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa3949dd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #12: + 0x5adc309 (0x7f63d29f6309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: recv_activation_tensor = recv_activation() -[default5]:[rank29]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5650605b7a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #23: + 0x150866 (0x5650605ca866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5650605b3142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #23: + 0x150866 (0x562a60869866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x562a60852142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #18: + 0x5af36b5 (0x7f0cc0e396b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #23: + 0x150866 (0x56322436e866 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563224357142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5650605bea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #26: PyObject_Call + 0xbc (0x5650605caf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5650605b12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #25: _PyFunction_Vectorcall + 0x6c (0x562a6085da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #26: PyObject_Call + 0xbc (0x562a60869f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #19: + 0xd2631e (0x7f0cd3a2331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f051672a189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f0516731610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563224362a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #13: + 0x5ae6f10 (0x7f63d2a00f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank21]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b9930318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #45: + 0x150582 (0x55b99304c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5650605bea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5650605af8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #30: + 0x150582 (0x5650605ca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5650605af8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x562a608502b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fa35c1ea189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank16]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank29]: frame #32: + 0x150582 (0x5650605ca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5650605af8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #34: + 0x150582 (0x5650605ca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5650605af8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #28: _PyFunction_Vectorcall + 0x6c (0x562a6085da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #59: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fa35c1f1610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #26: PyObject_Call + 0xbc (0x56322436ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank23]: sharded_logits = self.model( -[default5]:[rank29]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5650605b6f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5650605c8c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #38: + 0x211239 (0x56506068b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5650605b7a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x562a6084e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #60: PyObject_Call + 0xbc (0x563b114fdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563b114e42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fc3cb3a0189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fc3cb3a7610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5650605b33e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #41: 
_PyFunction_Vectorcall + 0x6c (0x5650605bea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5650605aec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #30: + 0x150582 (0x562a60869582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #20: + 0x47def4 (0x7f0cd317aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5632243552b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563224362a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5599a561a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5599a5625a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5650605bea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5650605af8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #45: + 0x150582 (0x5650605ca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #46: PyObject_Call + 0xbc (0x5650605caf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #62: + 0x150582 (0x563b114fd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #63: PyObject_Call + 0xbc (0x563b114fdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fc3cb3c6978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5632243538fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fada0ac0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank19]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5650605b12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #48: + 0x150582 (0x5650605ca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #49: PyObject_Call + 0xbc (0x5650605caf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default5]:[rank29]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5650605b12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank37]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fa35c210978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #30: + 0x150582 (0x56322436e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: frame #14: + 0x5ae6fa5 (0x7f63d2a00fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank29]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5650605bea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5650605b7007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5650605c8c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x562a6084e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #32: + 0x150582 (0x562a60869582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x562a6084e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #21: + 0x1445a6 (0x55d1f2c5e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5632243538fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #1: + 0x5b3a23e (0x7fadda5dd23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: frame #54: + 0x211239 (0x56506068b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #55: PyObject_Call + 0x207 (0x5650605cb067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5650605b12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank29]: frame #57: + 0x150582 (0x5650605ca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #34: + 0x150582 (0x562a60869582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x562a6084e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #36: _PyObject_FastCallDictTstate + 0xd0 
-[default5]:[rank29]: frames #58-#63: CPython call frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default5]:[rank29]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default3]:[rank51]: frames #37-#63: CPython call frames (_PyObject_Call_Prepend, _PyObject_MakeTpCall, _PyObject_FastCallDictTstate, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default3]:[rank51]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank49]: trainer.train(dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank49]: output = model(**micro_batch)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank49]: sharded_logits = self.model(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank49]: pipeline_state.run_communication()
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank49]: recv_activation_tensor = recv_activation()
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank49]: dist.recv(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank49]: return func(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank49]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer
-[default1]:[rank49]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank49]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faa369a9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:[rank49]: frame #1: + 0x5b3a23e (0x7faa704c623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x2c7 (0x7faa704c0c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7faa704c0f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7faa704c1fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faa70476371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faa70476371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faa70476371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faa70476371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7faa37c83189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:[rank49]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7faa37c8a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:[rank49]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor>&, int, int) + 0x5f8 (0x7faa37ca9978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:[rank49]: frames #12-#20: libtorch_cpu.so / libtorch_python.so offsets
-[default1]:[rank49]: frames #21-#63: CPython call frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank16]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default0]:[rank16]: dist.recv(
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank16]: return func(*args, **kwargs)
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank16]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank16]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default0]:[rank16]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default0]:[rank16]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb28a79b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:[rank16]: frame #1: + 0x5b3a23e (0x7fb2c42b823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank16]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x2c7 (0x7fb2c42b2c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank41]: dist.recv(
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank41]: return func(*args, **kwargs)
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank41]: torch.distributed.DistBackendError: [2] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '1:2', but store->get('1:2') got error: Connection reset by peer
-[default1]:[rank41]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank41]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2bc23b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:[rank41]: frame #1: + 0x5b3a23e (0x7f2bfbecf23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank41]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x2c7 (0x7f2bfbec9c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default3]:[rank19]: Python traceback in progress, same call path as rank 49: llama.py:764 forward -> llama.py:780 forward_with_hidden_states -> encoder_block(**hidden_encoder_states) -> pipeline_parallel/block.py:126 forward
-[default7]:[rank23]: Python traceback in progress, same call path as rank 49: llama.py:764 forward -> llama.py:780 forward_with_hidden_states -> encoder_block(**hidden_encoder_states) -> pipeline_parallel/block.py:126 forward -> functional.py:117 recv_from_pipeline_state_buffer -> state.py:150 run_communication -> state.py:31 __call__ -> p2p.py:353 recv_tensors
-[default0]:[rank32]: frames #22-#43: CPython call frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default2]:[rank34]: frames #12-#37: libtorch_cpu.so / libtorch_python.so offsets, then CPython call frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default3]:[rank35]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor>&, int, int) + 0x5f8 (0x7f0516750978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank35]: frames #12-#47: libtorch_cpu.so / libtorch_python.so offsets, then CPython call frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default5]:[rank37]: frames #12-#29: libtorch_cpu.so / libtorch_python.so offsets, then CPython call frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default2]:[rank42]: frames #15-#33: libtorch_cpu.so / libtorch_python.so offsets, then CPython call frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default4]:[rank44]: frames #32-#55: CPython call frames (_PyEval_EvalFrameDefault, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend, _PyObject_MakeTpCall, _PyFunction_Vectorcall, PyObject_Call) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default5]:[rank45]: frames #2-#11: c10d::TCPStore::doWait/doGet/get, c10d::PrefixStore::get, c10d::ProcessGroupNCCL::broadcastUniqueNCCLID/getNCCLComm/recv in libtorch_cpu.so / libtorch_cuda.so
-[default5]:[rank45]: frames #12-#19: libtorch_cpu.so / libtorch_python.so offsets
-[default2]:[rank18]: frames #42-#60: CPython call frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default5]:[rank21]: frames #46-#55: CPython call frames (PyObject_Call, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default6]:[rank22]: frames #27-#44: CPython call frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend, _PyObject_MakeTpCall) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default4]:[rank52]: Traceback (most recent call last):
-[default4]:[rank52]: same call path as rank 49: run_train.py:237 <module> -> trainer.py:429 train -> trainer.py:462 training_step -> engine.py:278 train_batch_iter -> engine.py:44 forward -> llama.py:891 forward -> llama.py:764 forward -> llama.py:780 forward_with_hidden_states -> block.py:126 forward -> functional.py:117 recv_from_pipeline_state_buffer -> state.py:150 run_communication -> state.py:31 __call__ -> p2p.py:353 recv_tensors -> p2p.py:326 irecv_tensors -> p2p.py:246 _recv_meta -> distributed_c10d.py:1932 recv
-[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank52]: torch.distributed.DistBackendError: [3] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '2:3', but store->get('2:3') got error: Connection reset by peer
-[default4]:[rank52]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default4]:[rank52]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f759e8b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:[rank52]: frame #1: + 0x5b3a23e (0x7f75d83d423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x2c7 (0x7f75d83cec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f75d83cef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f75d83cffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f75d8384371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f75d8384371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f75d8384371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f75d8384371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default4]:[rank52]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f759fb91189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:[rank52]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f759fb98610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:[rank52]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor>&, int, int) + 0x5f8 (0x7f759fbb7978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:[rank52]: frames #12-#20: libtorch_cpu.so / libtorch_python.so offsets
-[default4]:[rank52]: frames #21-#28: CPython call frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default4]:[rank52]: frame #29: _PyEval_EvalFrameDefault + 0x13ca 
(0x5600f667c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #30: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5600f667c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #32: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #48: + 0x150582 (0x55e5c85fe582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #30: + 0x150582 (0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55d5815a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #20: + 0x47def4 (0x7fadec8d7ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank19]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank52]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5600f667c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #34: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5600f667c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5600f6683f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #49: PyObject_Call + 0xbc (0x55e5c85fef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5632243552b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #57: + 0x150582 (0x56322436e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #45: + 0x150582 (0x5651896b2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5600f6695c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #38: + 0x211239 (0x5600f6758239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5600f6684a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5600f66803e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5600f668ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55d1f2c4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f2bfbec9f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank18]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5599a56182b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5600f667bc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5600f668ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5600f667c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #45: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #45: + 0x150582 (0x55d1f2c6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5632243538fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #59: + 0x150582 (0x56322436e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank16]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fb2c42b2f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #46: PyObject_Call + 0xbc (0x5600f6697f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5600f667e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #48: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #49: PyObject_Call + 0xbc (0x5600f6697f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #38: + 0x211239 (0x5599a4e5c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55e5c85e52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #60: PyObject_Call + 0xbc (0x56322436ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f2bfbecafd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank23]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5600f667e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5600f668ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5600f6684007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55e5c85f2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #32: + 0x150582 
(0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5632243552b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #62: + 0x150582 (0x56322436e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #46: PyObject_Call + 0xbc (0x5651896b2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5600f6695c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #54: + 0x211239 (0x5600f6758239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #55: PyObject_Call + 0x207 (0x5600f6698067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5600f667e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55e5c85eb007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2bfbe7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5651896992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: frame #62: + 0x150582 (0x5599a5631582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #57: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5600f667c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #59: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #60: PyObject_Call + 0xbc (0x5600f6697f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5599a4d88a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2bfbe7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank18]: frame #63: PyObject_Call + 0xbc (0x5599a5631f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank52]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5600f667e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #62: + 0x150582 (0x5600f6697582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #63: PyObject_Call + 0xbc (0x5600f6697f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default0]:[rank32]: frame #46: PyObject_Call + 0xbc (0x55d1f2c6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55d5815a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #34: + 0x150582 (0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #63: PyObject_Call + 0xbc (0x56322436ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: dist.recv( -[default2]:[rank34]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5599a4d843e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #21: + 0x1445a6 (0x55bf533655a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank18]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank37]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55d5815a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55bf5335ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2bfbe7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fb2c42b3fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55d5815acf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55d5815bec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #34: + 0x150582 (0x5580c867f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb2c4268371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #38: + 0x211239 (0x55d581681239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5580c86648fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default0]:[rank16]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb2c4268371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1f2c512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2bfbe7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2bc368c189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5599a4d8fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5599a4d7fc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5580c866bf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb2c4268371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb2c4268371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55d5815ada6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f2bc3693610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank19]: pipeline_state.run_communication() -[default5]:[rank21]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b9930332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55e5c85fcc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #54: + 0x211239 (0x55e5c86bf239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #23: + 0x150866 (0x55bf53371866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #48: + 0x150582 (0x5651896b2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #49: PyObject_Call + 0xbc (0x5651896b2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55d5815a93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5580c867dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #11: 
c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2bc36b2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank16]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fb28ba75189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55d5815b4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55bf5335a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: return func(*args, **kwargs) -[default2]:[rank34]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5599a4d8fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #12: + 0x5adc309 (0x7f2bfbe71309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5651896992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #55: PyObject_Call + 0x207 (0x55e5c85ff067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #38: + 0x211239 (0x5580c8740239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5651896a6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55d5815a4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55bf53365a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #13: + 0x5ae6f10 (0x7f2bfbe7bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55d5815b4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5580c866ca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fb28ba7c610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55e5c85e52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #57: + 0x150582 (0x55e5c85fe582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5580c86683e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #11: 
c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fb28ba9b978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank19]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5599a4d808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #14: + 0x5ae6fa5 (0x7f2bfbe7bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank21]: frame #57: + 0x150582 (0x55b99304c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #45: + 0x150582 (0x5599a4d9b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5580c8673a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: frame #48: + 0x150582 (0x55d1f2c6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #49: PyObject_Call + 0xbc (0x55d1f2c6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55d5815a58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #26: PyObject_Call + 0xbc (0x55bf53371f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b9930318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #59: + 0x150582 (0x55b99304c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #46: PyObject_Call + 0xbc (0x5599a4d9bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #15: + 0x5124446 (0x7f2bfb4b9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #12: + 0x5adc309 (0x7fb2c425a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #13: + 0x5ae6f10 (0x7fb2c4264f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5599a4d822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5580c8663c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: frame #45: + 0x150582 (0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55e5c85e38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #16: + 0x1acf4b8 (0x7f2bf7e644b8 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank22]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56518969f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #60: PyObject_Call + 0xbc (0x55b99304cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: frame #59: + 0x150582 (0x55e5c85fe582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5580c8673a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b9930332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1f2c512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55d1f2c5ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #48: + 0x150582 (0x5599a4d9b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #17: + 0x5aee004 (0x7f2bfbe83004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank21]: frame #62: + 0x150582 (0x55b99304c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #46: PyObject_Call + 0xbc (0x55d5815c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf533582b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5651896b0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55d5815a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #60: PyObject_Call + 0xbc (0x55e5c85fef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55e5c85e52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55bf53365a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #18: + 0x5af36b5 (0x7f2bfbe886b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5580c86648fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #14: + 0x5ae6fa5 (0x7fb2c4264fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #62: + 0x150582 (0x55e5c85fe582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame 
#19: + 0xd2631e (0x7f2c0ea7231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank23]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank35]: frame #63: PyObject_Call + 0xbc (0x55e5c85fef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #20: + 0x47def4 (0x7f2c0e1c9ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank23]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank22]: frame #54: + 0x211239 (0x565189773239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55d1f2c57007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #48: + 0x150582 (0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #49: PyObject_Call + 0xbc (0x55d5815c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55d5815a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55bf533568fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #55: PyObject_Call + 0x207 (0x5651896b3067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #49: PyObject_Call + 0xbc (0x5599a4d9bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #21: + 0x1445a6 (0x55626f3c35a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #45: + 0x150582 (0x5580c867f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8aa156897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank32]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55d1f2c68c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #54: + 0x211239 (0x55d1f2d2b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #30: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #1: + 0x5b3a23e (0x7fa8e3c7323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5599a4d822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55626f3bca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: frame #63: PyObject_Call + 0xbc (0x55b99304cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55d5815b4a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #55: PyObject_Call + 0x207 (0x55d1f2c6b067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55bf533568fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank21]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank16]: frame #15: + 0x5124446 (0x7fb2c38a2446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5599a4d8fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5599a4d88007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #23: + 0x150866 (0x55626f3cf866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #46: PyObject_Call + 0xbc (0x5580c867ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5651896992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55d5815ad007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #32: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5580c86662b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5599a4d99c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55bf533568fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank16]: frame #16: + 0x1acf4b8 (0x7fb2c024d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fa8e3c6dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1f2c512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55d5815bec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #48: + 0x150582 (0x5580c867f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #49: PyObject_Call + 0xbc (0x5580c867ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #3: c10d::TCPStore::doGet(std::string 
const&) + 0x32 (0x7fa8e3c6df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #54: + 0x211239 (0x5599a4e5c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #55: PyObject_Call + 0x207 (0x5599a4d9c067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #34: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #57: + 0x150582 (0x5651896b2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #57: + 0x150582 (0x55d1f2c6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55626f3b8142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: dist.recv( -[default3]:[rank35]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank42]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5580c86662b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55bf533568fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #17: + 0x5aee004 (0x7fb2c426c004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5599a4d822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55626f3c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #18: + 0x5af36b5 (0x7fb2c42716b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fa8e3c6efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #57: + 0x150582 (0x5599a4d9b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55bf5335df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa8e3c23371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55d1f2c4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #54: + 0x211239 (0x55d581681239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #26: PyObject_Call + 0xbc (0x55626f3cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5651896978fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #55: PyObject_Call + 0x207 (0x55d5815c1067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #37: _PyObject_Call_Prepend + 0x69 
(0x55bf5336fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5580c8673a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: frame #59: + 0x150582 (0x55d1f2c6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #38: + 0x211239 (0x55bf53432239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa8e3c23371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa8e3c23371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5599a4d808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5580c866c007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55626f3b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #59: + 0x150582 (0x5651896b2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #59: + 0x150582 (0x5599a4d9b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55d5815a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55bf5335ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #60: PyObject_Call + 0xbc (0x5651896b2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #60: PyObject_Call + 0xbc (0x55d1f2c6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5580c867dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa8e3c23371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1f2c512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #57: + 0x150582 (0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55626f3c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fa8ab430189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55d5815a58fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55bf5335a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5651896992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #60: PyObject_Call + 0xbc (0x5599a4d9bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55bf53365a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55626f3b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #54: + 0x211239 (0x5580c8740239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: return func(*args, **kwargs) -[default0]:[rank32]: frame #62: + 0x150582 (0x55d1f2c6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #59: + 0x150582 (0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #30: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #19: + 0xd2631e (0x7fb2d6e5b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5599a4d822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #62: + 0x150582 (0x5599a4d9b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #63: PyObject_Call + 0xbc (0x55d1f2c6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #55: PyObject_Call + 0x207 (0x5580c8680067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: frame #60: PyObject_Call + 0xbc (0x55d5815c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55626f3b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank23]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fa8ab437610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank41]: frame #32: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fa8ab456978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #63: PyObject_Call + 0xbc (0x5599a4d9bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55bf53355c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #20: + 0x47def4 (0x7fb2d65b2ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55d5815a72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5580c86662b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #21: + 0x1445a6 (0x55d12692a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank45]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55bf53365a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #12: + 0x5adc309 (0x7fa8e3c15309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #62: + 0x150582 (0x55d5815c0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #63: PyObject_Call + 0xbc (0x55d5815c0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55626f3b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #13: + 0x5ae6f10 (0x7fa8e3c1ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank45]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55bf533568fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #34: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #62: + 0x150582 (0x5651896b2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #57: + 0x150582 (0x5580c867f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: frame #63: PyObject_Call + 0xbc (0x5651896b2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55626f3b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #45: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #14: + 0x5ae6fa5 (0x7fa8e3c1ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55626f3bbf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55626f3cdc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #15: + 0x5124446 (0x7fa8e325d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #46: PyObject_Call + 0xbc (0x55bf53371f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf533582b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank41]: frame #38: + 0x211239 (0x55626f490239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5580c86648fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #48: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55626f3bca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #16: + 0x1acf4b8 (0x7fa8dfc084b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #49: PyObject_Call + 0xbc (0x55bf53371f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #17: + 0x5aee004 (0x7fa8e3c27004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55626f3b83e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #0: c10::Error::Error(c10::SourceLocation, 
std::string) + 0x57 (0x7fd8bb9e5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank45]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf533582b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #1: + 0x5b3a23e (0x7fd8f550223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #59: + 0x150582 (0x5580c867f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55626f3c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55626f3b3c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank22]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank45]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55bf53365a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #18: + 0x5af36b5 (0x7fa8e3c2c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #60: PyObject_Call + 0xbc (0x5580c867ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #19: + 0xd2631e (0x7fa8f681631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank19]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd8f54fcc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55d126923a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55bf5335e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #20: + 0x47def4 (0x7fa8f5f6def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5580c86662b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #21: + 0x1445a6 (0x5556094e35a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55bf5336fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #23: + 0x150866 (0x55d126936866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #62: + 0x150582 (0x5580c867f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5556094dca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #23: + 0x150866 (0x5556094ef866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55626f3c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #54: + 0x211239 (0x55bf53432239 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #55: PyObject_Call + 0x207 (0x55bf53372067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55d12691f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #63: PyObject_Call + 0xbc (0x5580c867ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5556094d8142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5556094e3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55626f3b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd8f54fcf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank19]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd8f54fdfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #45: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55d12692aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf533582b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd8f54b2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #46: PyObject_Call + 0xbc (0x55626f3cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #26: PyObject_Call + 0xbc (0x55d126936f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #26: PyObject_Call + 0xbc (0x5556094eff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55626f3b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #57: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5556094d62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #48: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55d12691d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd8f54b2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: frame #28: _PyFunction_Vectorcall + 0x6c 
(0x5556094e3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55bf533568fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #59: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5556094d48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #60: PyObject_Call + 0xbc (0x55bf53371f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf533582b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd8f54b2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #62: + 0x150582 (0x55bf53371582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #49: PyObject_Call + 0xbc (0x55626f3cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55626f3b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55626f3c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd8f54b2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55626f3bc007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55626f3cdc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd8bccbf189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #54: + 0x211239 (0x55626f490239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55d12692aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55d12691b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #63: PyObject_Call + 0xbc (0x55bf53371f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd8bccc6610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank19]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd8bcce5978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #55: PyObject_Call + 0x207 (0x55626f3d0067 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #12: + 0x5adc309 (0x7fd8f54a4309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #30: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank19]: frame #13: + 0x5ae6f10 (0x7fd8f54aef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55626f3b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #57: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55626f3b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55d12691b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #59: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #60: PyObject_Call + 0xbc (0x55626f3cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55626f3b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #32: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55d12691b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #62: + 0x150582 (0x55626f3cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #63: PyObject_Call + 0xbc (0x55626f3cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
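Editor's note: the DistBackendError reported by rank 19 in the traces above names the failing step precisely: a non-zero rank is fetching the ncclUniqueId that rank 0 is supposed to publish through the c10d key-value store (key '0:1'), and the store connection is reset because the peer side has already gone down. The sketch below is not nanotron or torch internals, only an illustration of that exchange; the host, port, payload and function name are placeholders, and the key '0:1' simply mirrors the one named in the error. The point is that store.get() is the blocking call where "Connection reset by peer" surfaces.

```python
# Illustrative sketch only: how a non-zero rank would fetch the ncclUniqueId
# that rank 0 publishes through the c10d TCPStore. Host, port and the payload
# are placeholders; the key '0:1' mirrors the one named in the error above.
from datetime import timedelta
import torch.distributed as dist

def exchange_nccl_unique_id(rank: int, world_size: int, master_addr: str, master_port: int):
    store = dist.TCPStore(master_addr, master_port, world_size,
                          is_master=(rank == 0), timeout=timedelta(seconds=60))
    if rank == 0:
        # Rank 0 creates the id (a dummy string here) and publishes it for the group.
        store.set("0:1", "<ncclUniqueId bytes>")
    else:
        # Other ranks block here; if rank 0 has already crashed, the TCP connection
        # is reset and this get() is where "Connection reset by peer" appears.
        return store.get("0:1")
```

In the real job this exchange happens inside ProcessGroupNCCL::broadcastUniqueNCCLID, visible as frame #9 in the C++ portion of the traces above; the Python-level frames after it are just the call stack of the training script reaching that point.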
-[default0]:[rank16]: frame #34: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55d12691b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #30: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5556094d48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #32: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #14: + 0x5ae6fa5 (0x7fd8f54aefa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5556094d48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #34: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55d126922f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #15: + 0x5124446 (0x7fd8f4aec446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55d126934c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5556094d48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5556094dbf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #38: + 0x211239 (0x55d1269f7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #16: + 0x1acf4b8 (0x7fd8f14974b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55d126923a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #17: + 0x5aee004 (0x7fd8f54b6004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank16]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55d12691f3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5556094edc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #38: + 0x211239 (0x5556095b0239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55d12692aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5556094dca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5556094d83e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #18: + 0x5af36b5 (0x7fd8f54bb6b5 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank23]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5556094e3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5556094d3c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55d12691ac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #19: + 0xd2631e (0x7fd9080a531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank19]: frame #20: + 0x47def4 (0x7fd9077fcef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank19]: frame #21: + 0x1445a6 (0x5561e41135a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55d12692aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5561e410ca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #23: + 0x150866 (0x5561e411f866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5556094e3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5556094d48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55d12691b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #45: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #45: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5561e4108142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5561e4113a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #46: PyObject_Call + 0xbc (0x5556094eff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #46: PyObject_Call + 0xbc (0x55d126936f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5556094d62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #26: PyObject_Call + 0xbc (0x5561e411ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #48: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #49: PyObject_Call + 0xbc (0x5556094eff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55d12691d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #50: 
_PyEval_EvalFrameDefault + 0x2d83 (0x5556094d62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5561e41062b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5556094e3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #48: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #49: PyObject_Call + 0xbc (0x55d126936f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5561e4113a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55d12691d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5556094dc007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5556094edc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5561e41048fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55d12692aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #54: + 0x211239 (0x5556095b0239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #55: PyObject_Call + 0x207 (0x5556094f0067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55d126923007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55d126934c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #54: + 0x211239 (0x55d1269f7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #55: PyObject_Call + 0x207 (0x55d126937067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5556094d62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #57: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55d12691d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5556094d48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #59: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #30: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #60: PyObject_Call + 0xbc (0x5556094eff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5556094d62b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #57: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55d12691b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5561e41048fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #62: + 0x150582 (0x5556094ef582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: frame #63: PyObject_Call + 0xbc (0x5556094eff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank23]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank19]: frame #32: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #59: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #60: PyObject_Call + 0xbc (0x55d126936f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55d12691d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #62: + 0x150582 (0x55d126936582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: frame #63: PyObject_Call + 0xbc (0x55d126936f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5561e41048fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank16]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default3]:[rank19]: frame #34: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5561e41048fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5561e410bf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5561e411dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #38: + 0x211239 (0x5561e41e0239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5561e410ca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5561e41083e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5561e4113a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5561e4103c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5561e4113a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5561e41048fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #45: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #46: PyObject_Call + 0xbc (0x5561e411ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5561e41062b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #48: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #49: PyObject_Call + 0xbc (0x5561e411ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5561e41062b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5561e4113a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5561e410c007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5561e411dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #54: + 0x211239 (0x5561e41e0239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #55: PyObject_Call + 0x207 (0x5561e4120067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5561e41062b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #57: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #58: 
_PyEval_EvalFrameDefault + 0x13ca (0x5561e41048fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #59: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #60: PyObject_Call + 0xbc (0x5561e411ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5561e41062b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #62: + 0x150582 (0x5561e411f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: frame #63: PyObject_Call + 0xbc (0x5561e411ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank19]: . This may indicate a possible application crash on rank 0 or a network set up issue. -W0703 03:18:54.302000 139993479001920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 683396 closing signal SIGTERM -W0703 03:18:54.303000 139993479001920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 683398 closing signal SIGTERM -W0703 03:18:54.303000 139993479001920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 683399 closing signal SIGTERM -W0703 03:18:54.303000 139993479001920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 683400 closing signal SIGTERM -E0703 03:18:54.426000 140665481819968 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 887276) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 887277) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 887278) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - 
rank : 3 (local_rank: 3) - exitcode : 1 (pid: 887279) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 887280) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 887281) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 887282) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 887283) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-103.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 887276) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-103: task 1: Exited with exit code 1 -E0703 03:18:55.125000 139993479001920 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 683394) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-138.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 683395) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-138.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 683397) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
-[3]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-138.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 683401) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:18:54 - host : ip-26-0-161-138.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 683394) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-138: task 2: Exited with exit code 1 -W0703 03:18:59.307000 140703858952000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1162957 closing signal SIGTERM -W0703 03:18:59.308000 140703858952000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1162960 closing signal SIGTERM -W0703 03:18:59.311000 140661720811328 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3783390 closing signal SIGTERM -W0703 03:18:59.311000 140661720811328 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3783392 closing signal SIGTERM -W0703 03:18:59.311000 140661720811328 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3783396 closing signal SIGTERM -E0703 03:18:59.431000 140672604792640 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 21938) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 03:18:59.433000 139967333427008 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1438906) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
-============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 33 (local_rank: 1) - exitcode : 1 (pid: 21939) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 21940) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 35 (local_rank: 3) - exitcode : 1 (pid: 21941) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 21942) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 37 (local_rank: 5) - exitcode : 1 (pid: 21943) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 (pid: 21944) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 39 (local_rank: 7) - exitcode : 1 (pid: 21945) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:18:59 - host : ip-26-0-166-125.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 21938) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 1438907) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 1438908) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 1438909) - error_file: - 
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 1438910) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 1438911) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 1438912) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 1438913) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 1438906) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 03:18:59.632000 140703858952000 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1162955) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1162956) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1162958) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 
1162959) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1162961) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1162962) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:18:59 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1162955) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 03:18:59.732000 140661720811328 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 3783391) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -srun: error: ip-26-0-166-125: task 4: Exited with exit code 1 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:18:59 - host : ip-26-0-171-102.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 3783393) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:18:59 - host : ip-26-0-171-102.ec2.internal - rank : 44 (local_rank: 4) - exitcode : 1 (pid: 3783394) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:18:59 - host : ip-26-0-171-102.ec2.internal - rank : 45 (local_rank: 5) - exitcode : 1 (pid: 3783395) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:18:59 - host : ip-26-0-171-102.ec2.internal - rank : 47 (local_rank: 7) - exitcode : 1 (pid: 3783397) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:18:59 - host : ip-26-0-171-102.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 3783391) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 0: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -W0703 03:19:03.976000 140593047774976 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3912329_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:19:04.268000 139844849026816 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_900772_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:19:04.307000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900845 closing signal SIGTERM -W0703 03:19:04.307000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900846 closing signal SIGTERM -W0703 03:19:04.308000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900847 closing signal SIGTERM -W0703 03:19:04.308000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900848 closing signal SIGTERM -W0703 03:19:04.309000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900849 closing signal SIGTERM -W0703 03:19:04.310000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900850 closing signal SIGTERM -W0703 03:19:04.310000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900851 closing signal SIGTERM -W0703 03:19:04.310000 139850509760320 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900852 closing signal SIGTERM -E0703 03:19:04.441000 140598708508480 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3912402) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:19:04.448000 140598708508480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3912329_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:19:04.476000 140598708508480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3912329_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:19:04.504000 140598708508480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3912329_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
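Editor's note: the RendezvousConnectionError warnings above, and the "Broken pipe" traceback that follows, share one root cause: each elastic agent syncs rendezvous state and keep-alive heartbeats through a TCPStore hosted on the --rdzv_endpoint node, and once that node's agent exits, every store call fails. The snippet below is only a rough illustration under assumed names; the key "torchelastic/assumed_state_key" and the StoreUnreachable wrapper are placeholders for this sketch, not torch's real key or exception.

```python
# Rough illustration under assumed names: a client-side read of rendezvous state
# from the TCPStore on the --rdzv_endpoint host. The key and StoreUnreachable are
# placeholders, not torch's internal key or its RendezvousConnectionError type.
from datetime import timedelta
import torch.distributed as dist

class StoreUnreachable(Exception):
    """Stand-in for torch's RendezvousConnectionError in this sketch."""

def read_rendezvous_state(host: str, port: int) -> bytes:
    try:
        store = dist.TCPStore(host, port, is_master=False,
                              timeout=timedelta(seconds=5), wait_for_workers=False)
        return store.get("torchelastic/assumed_state_key")
    except Exception as exc:
        # Once the endpoint host is gone, the connect/get fails with a connection
        # reset or broken pipe, which torch surfaces as RendezvousConnectionError.
        raise StoreUnreachable(f"c10d store at {host}:{port} is unreachable") from exc
```

This is why the remaining agents below can neither send heartbeats nor shut the rendezvous down cleanly: the store they would talk to lived on a node whose torchrun agent has already exited.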
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 3912403) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 3912404) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 3912405) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 3912406) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 3912407) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 3912408) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 3912409) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:19:04 - host : ip-26-0-171-62.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 3912402) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -W0703 03:19:09.272000 139844849026816 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 
'ip-26-0-171-88.ec2.internal_900772_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:19:12.657000 139850509760320 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_900772_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:19:12.668000 139850509760320 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_900772_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( 
-torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-256/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/bench.slurm deleted file mode 100644 index 35e915a9eba52f10f641024edcbba9646a263946..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doesn't update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd ..
-# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/config.yaml deleted file mode 100644 index cd25f84115adf2878079d3f73aba5618776c180b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 32 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 32 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out deleted file mode 100644 index 6e150da94e40dcbfecac4c3f2f2aed318bb4fbc3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/log.out +++ /dev/null @@ -1,5643 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:00:17 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:00:20.365000 139924922677056 torch/distributed/run.py:757] -W0703 03:00:20.365000 139924922677056 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.365000 139924922677056 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:00:20.365000 139924922677056 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.366000 139894981277504 torch/distributed/run.py:757] -W0703 03:00:20.366000 139894981277504 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.366000 139894981277504 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:00:20.366000 139894981277504 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.367000 140088873154368 torch/distributed/run.py:757] -W0703 03:00:20.367000 140088873154368 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.367000 140088873154368 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:00:20.367000 140088873154368 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.397000 140111955285824 torch/distributed/run.py:757] -W0703 03:00:20.397000 140111955285824 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.397000 140111955285824 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:00:20.397000 140111955285824 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.403000 139742997817152 torch/distributed/run.py:757] -W0703 03:00:20.403000 139742997817152 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.403000 139742997817152 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:00:20.403000 139742997817152 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.406000 140345940756288 torch/distributed/run.py:757] -W0703 03:00:20.406000 140345940756288 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.406000 140345940756288 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:00:20.406000 140345940756288 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.418000 140713406908224 torch/distributed/run.py:757] -W0703 03:00:20.418000 140713406908224 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.418000 140713406908224 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:00:20.418000 140713406908224 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.490000 140668047779648 torch/distributed/run.py:757] -W0703 03:00:20.490000 140668047779648 torch/distributed/run.py:757] ***************************************** -W0703 03:00:20.490000 140668047779648 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:00:20.490000 140668047779648 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:00:41 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=4, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=16, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:00:41 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=32, -[default0]:07/03/2024 03:00:41 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=32, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32')), -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272) -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 03:00:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-247]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-247]: No checkpoint path provided. -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-247]: No checkpoint path provided. -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=5|ip-26-0-173-246]: No checkpoint path provided. -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-247]: No checkpoint path provided. -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-247]: No checkpoint path provided. -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=6|ip-26-0-173-246]: No checkpoint path provided. -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-247]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-247]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=0|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=7|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-247]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-247]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-247]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=1|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=1|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=1|ip-26-0-169-139]: No checkpoint path provided. -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=4|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=4|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=4|ip-26-0-169-139]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=2|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=2|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=2|ip-26-0-169-139]: No checkpoint path provided. -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=7|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=7|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=7|ip-26-0-169-139]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=3|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=2|ip-26-0-173-246]: No checkpoint path provided. -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=1|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=13|ip-26-0-174-36]: No checkpoint path provided. -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=9|ip-26-0-174-36]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=0|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=0|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=0|ip-26-0-169-139]: No checkpoint path provided. 
-[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. 
-[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=3|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=3|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=3|ip-26-0-169-139]: No checkpoint path provided. -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=6|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=6|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=6|ip-26-0-169-139]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=12|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=8|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Total number of parameters: 1.21G (2315.81MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=11|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Parametrizing model parameters using StandardParametrizator -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=10|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=14|ip-26-0-174-36]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-246]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-246]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=4|ip-26-0-173-246]: No checkpoint path provided. -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: No checkpoint path provided. 
-[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=5|ip-26-0-169-139]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=5|ip-26-0-169-139]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:00:59 [INFO|DP=0|PP=2|TP=5|ip-26-0-169-139]: No checkpoint path provided. -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: No checkpoint path provided. -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=3|TP=15|ip-26-0-174-36]: No checkpoint path provided. -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 03:00:59 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 03:01:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 03:01:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 03:01:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 03:01:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 03:01:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Using `datasets` library -[default0]:07/03/2024 03:01:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:01:01 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 03:01:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 03:01:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 03:01:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: -[default0]:07/03/2024 03:01:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Start training] datetime: 2024-07-03 03:01:02.473744 | mbs: 32 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 03:01:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 03:01:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=9|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=7|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=0|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=13|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=8|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=11|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=14|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=7|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=4|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=2|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=2|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=9|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. 
-[default1]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=15|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=12|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=8|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=4|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=1|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=15|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=6|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=10|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=5|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=10|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=3|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=13|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=0|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=10|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=5|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=6|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=3|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=0|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. 
Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=14|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=12|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=9|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=1|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=0|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=3|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=6|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=11|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=2|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=5|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=15|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=7|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:01:02 [WARNING|DP=0|PP=2|TP=11|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:01:02 [WARNING|DP=0|PP=1|TP=4|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 03:01:02 [WARNING|DP=0|PP=3|TP=14|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:01:03 [WARNING|DP=0|PP=0|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:01:07 [WARNING|DP=0|PP=3|TP=1|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600011 milliseconds before timing out.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600000 milliseconds before timing out.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600098 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600026 milliseconds before timing out.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600098 milliseconds before timing out.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600099 milliseconds before timing out.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600099 milliseconds before timing out.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank11]: send_activation() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank11]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank11]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank11]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank11]: dist.send( 
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank11]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank22]: send_activation() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank22]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank22]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank22]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank22]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank22]: dist.send( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank22]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in 
backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank7]: send_activation() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank7]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank7]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank7]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank7]: dist.send( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank7]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank1]: send_activation() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank1]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank1]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank1]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank1]: dist.send( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank1]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank20]: send_activation() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank20]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank20]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank20]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank20]: dist.send( 
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank20]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank24]: send_activation() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank24]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank24]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank24]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default0]:[rank24]: dist.send( -[default6]:[rank30]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: return func(*args, **kwargs) -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank24]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank26]: send_activation() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank26]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank26]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank26]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank26]: dist.send( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank30]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank26]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank30]: send_activation() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank30]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank30]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank30]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank30]: dist.send( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank30]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank27]: result = loss.backward() -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank27]: _engine_run_backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank27]: return user_fn(self, *args) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank27]: send_activation() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank27]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank27]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank27]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank27]: dist.send( 
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank27]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank23]: send_activation() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank23]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank23]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank23]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank23]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank23]: dist.send( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank23]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank14]: send_activation() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank14]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank14]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank14]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank14]: dist.send( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank14]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank12]: send_activation() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank12]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank12]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank12]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank12]: dist.send( 
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank12]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank0]: send_activation() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank0]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default0]:[rank0]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank0]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank0]: dist.send( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank0]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank8]: send_activation() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank8]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank8]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank8]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank8]: dist.send( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank8]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f181a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f1947dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f19482a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f19483dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f2f64f1ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f2f69f63609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f2f69d2e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f181a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f1947dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f19482a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f19483dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f2f64f1ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f2f69f63609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f2f69d2e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f181a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: + 0xe32119 (0x7f2f19107119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: + 0xd3e95 (0x7f2f64f1ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: + 0x8609 (0x7f2f69f63609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7f2f69d2e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600011 milliseconds before timing out.
-[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank9]: send_activation() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank9]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank9]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank9]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank9]: dist.send( -[default1]:[rank9]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank9]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank10]: send_activation() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank10]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default2]:[rank10]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank10]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank10]: dist.send( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank10]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank6]: send_activation() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank6]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank6]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank6]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: dist.send( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank6]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank31]: send_activation() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank31]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank31]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank31]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank31]: dist.send( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank31]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank5]: send_activation() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank5]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank5]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank5]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank5]: dist.send( -[default5]:[rank5]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank5]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f312ba7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f312cd53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f312cd58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f312cd59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f31787f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f317d839609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f317d604353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f312ba7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f312cd53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f312cd58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f312cd59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f31787f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f317d839609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f317d604353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f312ba7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f312c9dd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f31787f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f317d839609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f317d604353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank28]: send_activation() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank28]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank28]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank28]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank28]: dist.send( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank28]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank25]: send_activation() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank25]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank25]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank25]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank25]: dist.send( 
-[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank25]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank4]: send_activation() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank4]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default4]:[rank4]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank4]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank4]: dist.send( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank4]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank13]: send_activation() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank13]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank13]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank13]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank13]: dist.send( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank13]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f574475a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5745a33c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5745a38a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5745a39dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f57914d2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5796519609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f57962e4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f574475a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5745a33c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5745a38a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5745a39dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f57914d2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5796519609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f57962e4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f574475a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f57456bd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f57914d2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f5796519609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f57962e4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank29]: send_activation() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank29]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank29]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank29]: self._send_meta(tensor, to_rank=to_rank, 
tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank29]: dist.send( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank29]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff611552897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff61282bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff612830a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff612831dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ff65e2cae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ff663311609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ff6630dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff611552897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff61282bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff612830a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff612831dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ff65e2cae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ff663311609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ff6630dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff611552897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7ff6124b5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7ff65e2cae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7ff663311609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7ff6630dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank19]: send_activation() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank19]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank19]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank19]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank19]: dist.send( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank19]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
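Note: every Python traceback above bottoms out in the same pipeline-parallel send path — trainer.train → training_step → train_batch_iter → backward → run_communication → send_activation → p2p.send_tensors → isend_tensors → _send_meta → dist.send — and the operations the NCCL watchdog reports as timing out are all point-to-point SENDs, from the small metadata message (NumelIn=6) up to full activation payloads. A SEND that sits for the whole 600000 ms window typically means the peer pipeline stage never posted the matching receive. Below is a minimal sketch of that two-step "meta then payload" send/recv shape; only the function names come from the traceback, the bodies are illustrative assumptions (not nanotron's actual p2p.py), and it uses the gloo backend so it can run without GPUs.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def send_tensor(tensor: torch.Tensor, to_rank: int, tag: int = 0) -> None:
    # "_send_meta": a few int64s describing the payload, sent first
    # (this mirrors the small NumelIn=6 SEND in the watchdog messages above).
    meta = torch.tensor([tensor.ndim, *tensor.shape], dtype=torch.long)
    dist.send(meta, dst=to_rank, tag=tag)                 # blocks until the peer recvs
    dist.send(tensor.contiguous(), dst=to_rank, tag=tag)  # then the payload itself


def recv_tensor(from_rank: int, ndim: int, tag: int = 0) -> torch.Tensor:
    meta = torch.empty(1 + ndim, dtype=torch.long)
    dist.recv(meta, src=from_rank, tag=tag)
    payload = torch.empty(*meta[1:].tolist(), dtype=torch.float32)
    dist.recv(payload, src=from_rank, tag=tag)
    return payload


def worker(rank: int, world_size: int) -> None:
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29531")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    if rank == 0:
        send_tensor(torch.ones(2, 3), to_rank=1)
    else:
        # Drop this recv and rank 0's send blocks indefinitely -- the small-scale
        # analogue of the unmatched SENDs the watchdog is reporting in this log.
        print(recv_tensor(from_rank=0, ndim=2).shape)
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)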
-[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank18]: result = loss.backward() -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: return user_fn(self, *args) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank18]: send_activation() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank18]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank18]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank18]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank18]: dist.send( 
-[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank18]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank17]: send_activation() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank17]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank17]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank17]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank17]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank17]: dist.send( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank17]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return user_fn(self, *args) -[default5]:[rank21]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank21]: send_activation() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank21]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank21]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank21]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank21]: dist.send( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank21]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank16]: send_activation() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank16]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank16]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank16]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank16]: dist.send( 
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank16]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9fff782897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa000a5bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa000a60a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa000a61dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa04c4fae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa051541609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa05130c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9fff782897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa000a5bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa000a60a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa000a61dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa04c4fae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa051541609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa05130c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9fff782897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fa0006e5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fa04c4fae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fa051541609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fa05130c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5687030897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5688309c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f568830ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f568830fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f56d3da8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f56d8def609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f56d8bba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5687030897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5688309c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f568830ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f568830fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f56d3da8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f56d8def609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f56d8bba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5687030897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f5687f93119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f56d3da8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f56d8def609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f56d8bba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70534f4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70547cdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70547d2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70547d3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f70a026ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f70a52b3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f70a507e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70534f4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70547cdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70547d2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70547d3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f70a026ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f70a52b3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f70a507e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70534f4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f7054457119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f70a026ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f70a52b3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f70a507e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bf4394897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9bf566dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9bf5672a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9bf5673dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9c4110ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9c46153609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9c45f1e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bf4394897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9bf566dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9bf5672a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9bf5673dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9c4110ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9c46153609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9c45f1e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bf4394897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f9bf52f7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9c4110ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f9c46153609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f9c45f1e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59df6c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59e099bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59e09a0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59e09a1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f5a2c43ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f5a31481609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5a3124c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59df6c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59e099bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59e09a0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59e09a1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f5a2c43ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f5a31481609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5a3124c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59df6c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f59e0625119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f5a2c43ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f5a31481609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f5a3124c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0d77814897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0d78aedc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0d78af2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0d78af3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f0dc458ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0dc95d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0dc939e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0d77814897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f0d78aedc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0d78af2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0d78af3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: <unknown function> + 0xd3e95 (0x7f0dc458ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: <unknown function> + 0x8609 (0x7f0dc95d3609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f0dc939e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0d77814897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: <unknown function> + 0xe32119 (0x7f0d78777119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: <unknown function> + 0xd3e95 (0x7f0dc458ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: <unknown function> + 0x8609 (0x7f0dc95d3609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7f0dc939e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600011 milliseconds before timing out.
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600011 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default0]:[rank40]: Traceback (most recent call last):
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank40]: trainer.train(dataloader)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank40]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank40]: grad_accumulator.backward(sum(activations))
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank40]: result = loss.backward()
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank40]: torch.autograd.backward(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank40]: _engine_run_backward(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank40]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank40]: return user_fn(self, *args)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank40]: pipeline_state.run_communication()
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default0]:[rank40]: self.grads_buffer.append(recv_grad())
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank40]: dist.recv(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank40]: return func(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank55]: Traceback (most recent call last):
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank55]: trainer.train(dataloader)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank55]: output = model(**micro_batch)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank55]: sharded_logits = self.model(
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank55]: pipeline_state.run_communication()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank55]: recv_activation_tensor = recv_activation()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank55]: dist.recv(
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank55]: return func(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76815e4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f76828bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f76828c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76828c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f76ce35ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f76d33a3609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f76d316e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what():  [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76815e4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f76828bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f76828c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76828c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f76ce35ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f76d33a3609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f76d316e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76815e4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: + 0xe32119 (0x7f7682547119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: + 0xd3e95 (0x7f76ce35ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: + 0x8609 (0x7f76d33a3609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7f76d316e353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
- […identical watchdog timeout output from [default0]:[rank24], [default2]:[rank26] and [default6]:[rank30] omitted (their lines are interleaved in the original log); it matches [rank27]'s output above except for the frame addresses and the reported elapsed times: 600000 ms for [rank24], 600099 ms for [rank26] and 600091 ms for [rank30]…]
-[default5]:[rank37]: Traceback (most recent call last):
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default5]:[rank37]: trainer.train(dataloader)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default5]:[rank37]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default5]:[rank37]: grad_accumulator.backward(sum(activations))
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default5]:[rank37]: result = loss.backward()
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default5]:[rank37]: torch.autograd.backward(
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default5]:[rank37]: _engine_run_backward(
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default5]:[rank37]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default5]:[rank37]: return user_fn(self, *args)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default5]:[rank37]: pipeline_state.run_communication()
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default5]:[rank37]: self.grads_buffer.append(recv_grad())
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank37]: dist.recv(
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank37]: return func(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
- […identical tracebacks omitted: [default5]:[rank45] matches the [rank37] backward-path traceback above (ending "NCCL communicator was aborted on rank 0."), while [default3]:[rank51], [default1]:[rank49], [default7]:[rank63] and [default1]:[rank57] match the [rank55] forward-path traceback above (ending "NCCL communicator was aborted on rank 1.")…]
- […the traceback from [default5]:[rank61] is likewise identical up to the frame below, where its log continues…]
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in
recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc68677a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc687a53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc687a58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc687a59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fc6d34f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fc6d8539609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fc6d8304353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc68677a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc687a53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc687a58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc687a59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fc6d34f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fc6d8539609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fc6d8304353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc68677a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fc6876dd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fc6d34f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fc6d8539609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fc6d8304353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe2470a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe248381c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe248386a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe248387dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe293e20e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe298e67609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe298c32353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe2470a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe248381c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe248386a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe248387dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe293e20e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe298e67609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe298c32353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe2470a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fe24800b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fe293e20e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fe298e67609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fe298c32353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8ae24a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8af523c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8af528a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8af529dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa8fafc2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa900009609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa8ffdd4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8ae24a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8af523c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8af528a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8af529dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa8fafc2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa900009609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa8ffdd4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8ae24a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fa8af1ad119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fa8fafc2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fa900009609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fa8ffdd4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank36]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank36]: grad_accumulator.backward(sum(activations)) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank36]: result = loss.backward() -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank36]: torch.autograd.backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank36]: _engine_run_backward( -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank36]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank36]: return user_fn(self, *args) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: self.grads_buffer.append(recv_grad()) -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, 
from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank38]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank38]: grad_accumulator.backward(sum(activations)) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank38]: result = loss.backward() -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: torch.autograd.backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default6]:[rank38]: _engine_run_backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank38]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank38]: return user_fn(self, *args) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank38]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank38]: self.grads_buffer.append(recv_grad()) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: 
buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa079bc5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa07ae9ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa07aea3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa07aea4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fa0c693de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fa0cb984609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fa0cb74f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa079bc5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa07ae9ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa07aea3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa07aea4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fa0c693de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fa0cb984609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fa0cb74f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa079bc5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fa07ab28119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fa0c693de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fa0cb984609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fa0cb74f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank39]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank39]: grad_accumulator.backward(sum(activations)) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank39]: result = loss.backward() -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank39]: torch.autograd.backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank39]: _engine_run_backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank39]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank39]: return user_fn(self, *args) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank39]: self.grads_buffer.append(recv_grad()) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f86c287f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f86c3b58c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f86c3b5da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f86c3b5edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f870f5f7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f871463e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e84c43897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3e85f1cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #6: clone + 0x43 (0x7f8714409353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3e85f21a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3e85f22dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3ed19bbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3ed6a02609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3ed67cd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default7]: -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f86c287f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e84c43897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f86c3b58c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f86c3b5da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f86c3b5edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f870f5f7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f871463e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3e85f1cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3e85f21a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3e85f22dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #6: clone + 0x43 (0x7f8714409353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #4: + 0xd3e95 (0x7f3ed19bbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3ed6a02609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3ed67cd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f86c287f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f86c37e2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f870f5f7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f871463e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f8714409353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e84c43897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f3e85ba6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) 
-[default7]:frame #2: + 0xd3e95 (0x7f3ed19bbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f3ed6a02609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f3ed67cd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank52]: pipeline_state.run_communication() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank52]: dist.recv( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: return func(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank43]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank43]: grad_accumulator.backward(sum(activations)) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank43]: result = loss.backward() -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank43]: torch.autograd.backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank43]: _engine_run_backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank43]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank43]: return user_fn(self, *args) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank43]: self.grads_buffer.append(recv_grad()) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab4097d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fab41c56c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fab41c5ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fab41c5cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fab8d6f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b12506897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #5: + 0x8609 (0x7fab9273c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2b137dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2b137e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2b137e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f2b5f27ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f2b642c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f2b64090353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #6: clone + 0x43 (0x7fab92507353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab4097d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b12506897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2b137dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2b137e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fab41c56c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2b137e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f2b5f27ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f2b642c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f2b64090353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fab41c5ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fab41c5cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fab8d6f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b12506897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f2b13469119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f2b5f27ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fab9273c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #3: + 0x8609 (0x7f2b642c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fab92507353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:frame #4: clone + 0x43 (0x7f2b64090353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab4097d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fab418e0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fab8d6f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fab9273c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fab92507353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]: -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank42]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank42]: grad_accumulator.backward(sum(activations)) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank42]: result = loss.backward() -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank42]: torch.autograd.backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank42]: _engine_run_backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank42]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank42]: return user_fn(self, *args) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank42]: self.grads_buffer.append(recv_grad()) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank42]: return 
self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b07915897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b08beec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b08bf3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b08bf4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1b5468de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1b596d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1b5949f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b07915897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b08beec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b08bf3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b08bf4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1b5468de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1b596d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1b5949f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b07915897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f1b08878119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f1b5468de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f1b596d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f1b5949f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8687e8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8689167c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f868916ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f868916ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f86d4c06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f86d9c4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f86d9a18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8687e8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8689167c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f868916ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f868916ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f86d4c06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f86d9c4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f86d9a18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8687e8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f8688df1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f86d4c06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f86d9c4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f86d9a18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f09e8eaa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f09ea183c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f09ea188a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f09ea189dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: <unknown function> + 0xd3e95 (0x7f0a35c22e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: <unknown function> + 0x8609 (0x7f0a3ac69609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f0a3aa34353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default2]:
[... repeated backtraces collapsed: every rank below also logs the identical "[E ProcessGroupNCCL.cpp:577] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data." and "[E ProcessGroupNCCL.cpp:583] To avoid data inconsistency, we are taking the entire process down." warnings, then "terminate called after throwing an instance of 'c10::DistBackendError'" with the same checkTimeout -> watchdogHandler -> ncclCommWatchdog backtrace shown above; only the per-rank timeout summaries are kept ...]
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600021 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600020 milliseconds before timing out.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600027 milliseconds before timing out.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600022 milliseconds before timing out.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd1e29c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdd1f575c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdd1f57aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdd1f57bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fdd6b014e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fdd7005b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fdd6fe26353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd1e29c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fdd1f1ff119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fdd6b014e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fdd7005b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fdd6fe26353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef52402897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef536dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef536e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef536e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fef9f17ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fefa41c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fefa3f8c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef52402897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef536dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef536e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef536e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fef9f17ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fefa41c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fefa3f8c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef52402897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fef53365119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fef9f17ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fefa41c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fefa3f8c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f62018de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6202bb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6202bbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6202bbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f624e656e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f625369d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6253468353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f62018de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6202bb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6202bbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6202bbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f624e656e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f625369d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6253468353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f62018de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f6202841119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f624e656e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f625369d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f6253468353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f085a2c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f085b5a2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f085b5a7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f085b5a8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f08a7041e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f08ac088609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f08abe53353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f085a2c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f085b5a2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f085b5a7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f085b5a8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f08a7041e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f08ac088609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f08abe53353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f085a2c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f085b22c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f08a7041e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f08ac088609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f08abe53353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0a4c22b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0a4d504c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0a4d509a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0a4d50adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0a98fa3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f0a9dfea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f0a9ddb5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0a4c22b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0a4d504c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0a4d509a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0a4d50adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0a98fa3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f0a9dfea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f0a9ddb5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0a4c22b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f0a4d18e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f0a98fa3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f0a9dfea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f0a9ddb5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa08b439897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa08c712c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa08c717a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa08c718dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fa0d81b1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fa0dd1f8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fa0dcfc3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa08b439897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa08c712c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa08c717a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa08c718dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fa0d81b1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fa0dd1f8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fa0dcfc3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa08b439897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fa08c39c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fa0d81b1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fa0dd1f8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fa0dcfc3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a9f85f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3aa0b38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3aa0b3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3aa0b3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3aec5d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3af161e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3af13e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a9f85f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3aa0b38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3aa0b3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3aa0b3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3aec5d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3af161e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3af13e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a9f85f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f3aa07c2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f3aec5d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f3af161e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f3af13e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb521b53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb522e2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb522e31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb522e32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb56e8cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb573912609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb5736dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f07930bf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb521b53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb522e2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0794398c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb522e31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb522e32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb56e8cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb573912609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb5736dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f079439da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f079439edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f07dfe37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb521b53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #5: + 0x8609 (0x7f07e4e7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f07e4c49353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #1: + 0xe32119 (0x7fb522ab6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fb56e8cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fb573912609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]: -[default2]:frame #4: clone + 0x43 (0x7fb5736dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f07930bf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0794398c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f079439da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f079439edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f07dfe37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f07e4e7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f07e4c49353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f07930bf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f0794022119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f07dfe37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f07e4e7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f07e4c49353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c355fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4c368d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4c368d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4c368dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4c82373e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4c873ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f4c87185353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c355fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4c368d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4c368d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4c368dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4c82373e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4c873ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f4c87185353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c355fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f4c3655e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f4c82373e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f4c873ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f4c87185353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe138bb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe139e8bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe139e90a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe139e91dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe18592ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe18a971609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe18a73c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe138bb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe139e8bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe139e90a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe139e91dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe18592ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe18a971609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe18a73c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe138bb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fe139b15119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fe18592ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fe18a971609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fe18a73c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f550e325897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f550f5fec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f550f603a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f550f604dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f555b09de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f55600e4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f555feaf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f550e325897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f550f5fec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f550f603a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f550f604dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f555b09de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f55600e4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f555feaf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f550e325897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f550f288119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f555b09de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f55600e4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f555feaf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb137311897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1385eac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1385efa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1385f0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fb184089e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fb1890d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fb188e9b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb137311897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1385eac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1385efa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1385f0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fb184089e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fb1890d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fb188e9b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb137311897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fb138274119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fb184089e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fb1890d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fb188e9b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42af9f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f42b0cd0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f42b0cd5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f42b0cd6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f42fc76fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f43017b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4301581353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42af9f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f42b0cd0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f42b0cd5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f42b0cd6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f42fc76fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f43017b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4301581353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42af9f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f42b095a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f42fc76fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f43017b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f4301581353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb0be0e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb0bf3bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb0bf3c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff60b733897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb0bf3c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb10ae5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb10fea2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb10fc6d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff60ca0cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff60ca11a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff60ca12dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff6584abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb0be0e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #5: + 0x8609 (0x7ff65d4f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff65d2bd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb0bf3bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]: -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb0bf3c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb0bf3c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb10ae5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, 
Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff60b733897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff60ca0cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #5: + 0x8609 (0x7fb10fea2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb10fc6d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff60ca11a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb0be0e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff60ca12dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff6584abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #1: + 0xe32119 (0x7fb0bf046119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb10ae5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fb10fea2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fb10fc6d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #5: + 0x8609 (0x7ff65d4f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff65d2bd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff60b733897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ff60c696119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff6584abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7ff65d4f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ff65d2bd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd8ba38897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcd8cd11c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcd8cd16a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcd8cd17dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fcdd87b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fcddd7f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fcddd5c2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd8ba38897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcd8cd11c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcd8cd16a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcd8cd17dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fcdd87b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fcddd7f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fcddd5c2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd8ba38897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fcd8c99b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fcdd87b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fcddd7f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fcddd5c2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ec8da2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0eca07bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0eca080a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0eca081dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f0f15b1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f0f1ab61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f0f1a92c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ec8da2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0eca07bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0eca080a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0eca081dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f0f15b1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f0f1ab61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f0f1a92c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ec8da2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f0ec9d05119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f0f15b1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f0f1ab61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f0f1a92c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2acda85897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2aced5ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2aced63a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2aced64dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2b1a7fde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f2b1f844609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2b1f60f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2acda85897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2aced5ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2aced63a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2aced64dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2b1a7fde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f2b1f844609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2b1f60f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2acda85897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f2ace9e8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f2b1a7fde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f2b1f844609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f2b1f60f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -W0703 03:11:23.254000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1658228 closing signal SIGTERM -W0703 03:11:23.254000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1658229 closing signal SIGTERM -W0703 03:11:23.254000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1658230 closing signal SIGTERM -W0703 03:11:23.254000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1658231 closing signal SIGTERM -W0703 03:11:23.254000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1658232 closing signal SIGTERM -W0703 03:11:23.254000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1658233 closing signal SIGTERM -W0703 03:11:23.255000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1658234 closing signal SIGTERM -E0703 03:11:25.582000 139742997817152 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1658227) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:11:23 - host : ip-26-0-162-233.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1658227) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1658227 -============================================================ -srun: error: ip-26-0-162-233: task 0: Exited with exit code 1 -W0703 03:11:27.090000 139889320544000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-174-36.ec2.internal_834480_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:27.178000 140662387046144 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_405377_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:27.930000 139919261943552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-147.ec2.internal_792184_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:27.945000 140340280022784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_320517_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:27.946000 140707746174720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-139.ec2.internal_203179_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:27.965000 140106294552320 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_894787_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:28.006000 140083212420864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_321809_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321879 closing signal SIGTERM -W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321880 closing signal SIGTERM -W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321881 closing signal SIGTERM -W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321882 closing signal SIGTERM -W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321883 closing signal SIGTERM -W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321884 closing signal SIGTERM -W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321885 closing signal SIGTERM -W0703 03:11:28.233000 140088873154368 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 321886 closing signal SIGTERM -W0703 03:11:28.252000 140668047779648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 405452 closing signal SIGTERM -W0703 03:11:28.252000 140668047779648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 405453 closing signal SIGTERM -W0703 03:11:28.252000 140668047779648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 405454 closing signal SIGTERM -W0703 03:11:28.252000 140668047779648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 405456 closing signal SIGTERM -W0703 03:11:28.292000 140111955285824 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 894862 closing signal SIGTERM -W0703 03:11:28.292000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320587 closing signal SIGTERM -W0703 03:11:28.292000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320588 closing signal SIGTERM -W0703 03:11:28.292000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320589 closing signal SIGTERM -W0703 03:11:28.292000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320590 closing signal SIGTERM -W0703 03:11:28.293000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320591 closing signal SIGTERM -W0703 03:11:28.293000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320592 closing signal SIGTERM -W0703 03:11:28.293000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320593 closing signal SIGTERM -W0703 03:11:28.293000 140345940756288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 320594 closing signal SIGTERM -W0703 03:11:28.309000 140713406908224 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203249 closing signal SIGTERM -W0703 03:11:28.310000 140713406908224 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203250 closing signal SIGTERM -W0703 03:11:28.310000 140713406908224 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203251 closing signal SIGTERM -W0703 03:11:28.310000 140713406908224 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203252 closing signal SIGTERM -W0703 03:11:28.310000 140713406908224 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203253 closing signal SIGTERM -W0703 03:11:28.310000 140713406908224 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203254 closing signal SIGTERM -W0703 03:11:28.310000 140713406908224 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203255 closing signal SIGTERM -W0703 03:11:28.310000 140713406908224 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 203256 closing signal SIGTERM -E0703 03:11:28.368000 139924922677056 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 792266) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:11:28.380000 139924922677056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_792184_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:28.412000 139924922677056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_792184_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:28.440000 139924922677056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_792184_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 792267) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 792267 -[2]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 792268) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 792268 -[3]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 792269) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 792269 -[4]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 792270) - error_file: - traceback : 
Signal 6 (SIGABRT) received by PID 792270 -[5]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 792271) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 792271 -[6]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 792272) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 792272 -[7]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 792273) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 792273 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:11:28 - host : ip-26-0-163-147.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 792266) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 792266 -============================================================ -E0703 03:11:28.504000 139894981277504 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 834550) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:11:28.516000 139894981277504 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_834480_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:28.547000 139894981277504 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_834480_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:28.575000 139894981277504 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_834480_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 834551) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834551 -[2]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 834552) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834552 -[3]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 834553) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834553 -[4]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 834554) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834554 -[5]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 834555) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834555 -[6]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 834556) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834556 -[7]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 834557) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834557 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:11:28 - host : ip-26-0-174-36.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 834550) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 834550 -============================================================ -E0703 03:11:28.903000 140111955285824 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 894858) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:11:28.915000 140111955285824 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_894787_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:11:28.950000 140111955285824 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_894787_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:28.976000 140111955285824 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_894787_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:11:28 - host : ip-26-0-165-24.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 894859) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 894859 -[2]: - time : 2024-07-03_03:11:28 - host : ip-26-0-165-24.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 894860) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 894860 -[3]: - time : 2024-07-03_03:11:28 - host : ip-26-0-165-24.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 894861) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 894861 -[4]: - time : 2024-07-03_03:11:28 - host : ip-26-0-165-24.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 894863) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 894863 -[5]: - time : 2024-07-03_03:11:28 - host : ip-26-0-165-24.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 894864) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 894864 -[6]: - time : 2024-07-03_03:11:28 - host : ip-26-0-165-24.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 894865) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 894865 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:11:28 - host : ip-26-0-165-24.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 894858) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 894858 -============================================================ -srun: error: ip-26-0-163-147: task 1: Exited with exit code 1 -E0703 03:11:29.608000 140668047779648 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 
0 (pid: 405451) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:11:29.622000 140668047779648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_405377_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:11:29.651000 140668047779648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_405377_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -W0703 03:11:29.668000 140668047779648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_405377_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:11:28 - host : ip-26-0-164-207.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 405455) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 405455 -[2]: - time : 2024-07-03_03:11:28 - host : ip-26-0-164-207.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 405457) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 405457 -[3]: - time : 2024-07-03_03:11:28 - host : ip-26-0-164-207.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 405458) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 405458 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:11:28 - host : ip-26-0-164-207.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 405451) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 405451 -============================================================ -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 2: Exited with exit code 1 -W0703 03:11:30.991000 140345940756288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_320517_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:11:31.005000 140345940756288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_320517_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 03:11:31.677000 140088873154368 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_321809_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:11:31.691000 140088873154368 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_321809_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 03:11:31.797000 140713406908224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-139.ec2.internal_203179_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:11:31.811000 140713406908224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-139.ec2.internal_203179_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-173-246: task 6: Exited with exit code 1 -srun: error: ip-26-0-169-247: task 5: Exited with exit code 1 -srun: error: ip-26-0-169-139: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. 
This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-32/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/bench.slurm deleted file mode 100644 index d4ce4ecb01879aae2d3da3613124c76a5174386c..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/config.yaml deleted file mode 100644 index c02a494433004cdbaf7136735ea2e90454b64810..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 256 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 4 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out deleted file mode 100644 index bb039d0c3d561bd87e8e0307799de491550dd831..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/log.out +++ /dev/null @@ -1,5727 +0,0 @@ -======================== -START TIME: Tue Jul 2 23:26:16 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0702 23:26:21.655000 139850683533120 torch/distributed/run.py:757] -W0702 23:26:21.655000 139850683533120 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.655000 139850683533120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:26:21.655000 139850683533120 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.752000 139855107929920 torch/distributed/run.py:757] -W0702 23:26:21.752000 139855107929920 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.752000 139855107929920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:26:21.752000 139855107929920 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.886000 140580253726528 torch/distributed/run.py:757] -W0702 23:26:21.886000 140580253726528 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.886000 140580253726528 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:26:21.886000 140580253726528 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.886000 139668319082304 torch/distributed/run.py:757] -W0702 23:26:21.886000 139668319082304 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.886000 139668319082304 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:26:21.886000 139668319082304 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.980000 140582428587840 torch/distributed/run.py:757] -W0702 23:26:21.980000 140582428587840 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.980000 140582428587840 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0702 23:26:21.980000 140582428587840 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.986000 140421053228864 torch/distributed/run.py:757] -W0702 23:26:21.986000 140421053228864 torch/distributed/run.py:757] ***************************************** -W0702 23:26:21.986000 140421053228864 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:26:21.986000 140421053228864 torch/distributed/run.py:757] ***************************************** -W0702 23:26:22.050000 140357880190784 torch/distributed/run.py:757] -W0702 23:26:22.050000 140357880190784 torch/distributed/run.py:757] ***************************************** -W0702 23:26:22.050000 140357880190784 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:26:22.050000 140357880190784 torch/distributed/run.py:757] ***************************************** -W0702 23:26:22.123000 139704473855808 torch/distributed/run.py:757] -W0702 23:26:22.123000 139704473855808 torch/distributed/run.py:757] ***************************************** -W0702 23:26:22.123000 139704473855808 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:26:22.123000 139704473855808 torch/distributed/run.py:757] ***************************************** -[default0]:07/02/2024 23:26:48 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=4, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:26:48 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=4, -[default0]:07/02/2024 23:26:48 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=256, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4')), -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/02/2024 23:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=2|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=4|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=6|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. 
Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=9|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=0|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=1|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=3|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=5|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=7|ip-26-0-162-233]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=2|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=15|ip-26-0-171-88]: No checkpoint path provided. 
-[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. 
Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-153]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-153]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=1|TP=15|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. 
Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/02/2024 23:27:06 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/02/2024 23:27:06 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/02/2024 23:27:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/02/2024 23:27:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/02/2024 23:27:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/02/2024 23:27:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/02/2024 23:27:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/02/2024 23:27:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:27:10 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/02/2024 23:27:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/02/2024 23:27:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/02/2024 23:27:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/02/2024 23:27:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-02 23:27:12.188738 | mbs: 4 | grad_accum: 256 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/02/2024 23:27:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/02/2024 23:27:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=9|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=11|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=8|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=10|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=13|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=0|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=6|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. 
Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=3|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=2|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=5|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=15|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=7|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=1|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=2|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=14|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=12|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=1|TP=4|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:27:12 [WARNING|DP=0|PP=3|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/02/2024 23:27:12 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:27:12 [WARNING|DP=0|PP=2|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
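Editorial note: the "Watchdog caught collective operation timeout" entries above all report the same Timeout(ms)=600000 (10 minutes) on a pending SEND before ProcessGroupNCCL aborts the communicator. As a purely illustrative sketch (not part of this benchmark's scripts; it assumes the process is launched with torchrun so the env:// rendezvous variables are set), a larger collective timeout can be passed explicitly when the NCCL process group is initialized:

# Hypothetical sketch: raise the collective timeout that the watchdog messages
# above report as Timeout(ms)=600000. Assumes launch via torchrun, which
# provides MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE for env:// initialization.
import datetime
import torch.distributed as dist

def init_with_longer_timeout() -> None:
    dist.init_process_group(
        backend="nccl",
        timeout=datetime.timedelta(minutes=30),  # instead of the 10 min seen above
    )

This does not address why the matching receive never arrived; it only changes how long the watchdog waits before aborting, as reflected in the log lines above and below.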
-[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank9]: send_activation() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank9]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank9]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank9]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank9]: dist.send( -[default1]:[rank9]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank9]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank6]: send_activation() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank6]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors 
-[default6]:[rank6]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank6]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: dist.send( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank6]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank11]: send_activation() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank11]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank11]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank11]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank11]: dist.send( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank11]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank5]: send_activation() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank5]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank5]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank5]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank5]: dist.send( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank5]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank17]: send_activation() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank17]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank17]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank17]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank17]: dist.send( 
-[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank17]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank19]: send_activation() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank19]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank19]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank19]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank19]: dist.send( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank19]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank27]: result = loss.backward() -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank27]: _engine_run_backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank27]: return user_fn(self, *args) -[default3]:[rank27]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank27]: send_activation() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank27]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank27]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank27]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank27]: dist.send( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank27]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank25]: send_activation() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank25]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank25]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank25]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank25]: dist.send( 
-[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank25]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank0]: send_activation() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank0]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default0]:[rank0]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank0]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank0]: dist.send( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank0]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank23]: send_activation() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank23]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank23]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank23]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank23]: dist.send( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank23]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank1]: send_activation() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank1]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank1]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank1]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank1]: dist.send( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank1]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return user_fn(self, *args) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank21]: send_activation() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank21]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank21]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank21]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank21]: dist.send( 
-[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank21]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank24]: send_activation() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank24]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank24]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank24]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank24]: dist.send( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank24]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f09c05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f0aedec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f0aee3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f0aee4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3f5697de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3f5b9c4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3f5b78f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f09c05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f0aedec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f0aee3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f0aee4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3f5697de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3f5b9c4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3f5b78f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f09c05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3f0ab68119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f3f5697de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f3f5b9c4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f3f5b78f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank8]: send_activation() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank8]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank8]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank8]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank8]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank8]: dist.send( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank8]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f7248f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f73768c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f7376da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f7376edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3fbf207e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3fc424e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3fc4019353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f7248f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f73768c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f7376da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f7376edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3fbf207e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3fc424e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3fc4019353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f7248f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f3f733f2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f3fbf207e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f3fc424e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f3fc4019353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank15]: send_activation() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank15]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank15]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank15]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank15]: dist.send( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank15]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94fd003897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f94fe2dcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f94fe2e1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f94fe2e2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f9549d7be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f954edc2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f954eb8d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94fd003897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f94fe2dcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f94fe2e1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f94fe2e2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f9549d7be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f954edc2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f954eb8d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94fd003897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f94fdf66119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f9549d7be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: <unknown function> + 0x8609 (0x7f954edc2609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7f954eb8d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default7]:[rank31]: Traceback (most recent call last):
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank31]:     trainer.train(dataloader)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank31]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank31]:     outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank31]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank31]:     grad_accumulator.backward(sum(activations))
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank31]:     result = loss.backward()
-[default7]:[rank31]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank31]:     torch.autograd.backward(
-[default7]:[rank31]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank31]:     _engine_run_backward(
-[default7]:[rank31]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank31]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default7]:[rank31]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank31]:     return user_fn(self, *args)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank31]:     pipeline_state.run_communication()
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default7]:[rank31]:     send_activation()
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default7]:[rank31]:     self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default7]:[rank31]:     futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default7]:[rank31]:     self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default7]:[rank31]:     dist.send(
-[default7]:[rank31]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank31]:     return func(*args, **kwargs)
-[default7]:[rank31]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default7]:[rank31]:     group.send([tensor], group_dst_rank, tag).wait()
-[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
- [... the same traceback, differing only in the rank id, is also raised by [rank3] and [rank4] ...]
- [... identical tracebacks (same send_activation / dist.send call chain) from [rank29], [rank10], and [rank16] ...]
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4d89d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7fa4d9caec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa4d9cb3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa4d9cb4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7fa52574de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7fa52a794609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7fa52a55f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]:  what():  [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
- [... the checkTimeout stack above is then printed again, followed by an identical stack from ncclCommWatchdog at ProcessGroupNCCL.cpp:1418 ...]
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
- [... [rank0] then prints the same terminate/what() message and the same checkTimeout and ncclCommWatchdog stacks as [rank5] ...]
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
- [... [rank1]'s terminate/what() message and stacks are identical; in the original log they are interleaved with a [rank7] traceback that matches the [rank31] traceback above ...]
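Since every rank prints one of the same two failure signatures, a short script can condense a log like this into the two rank sets that matter: who hit the ProcessGroupNCCL.cpp:1414 watchdog timeout and who died with the aborted-communicator DistBackendError. This is a hypothetical helper sketch, not part of bench_cluster; the hard-coded path is simply this run's log.out.

# Hedged sketch: summarise which ranks hit the NCCL watchdog timeout and which
# died with the aborted-communicator error in a combined log like this one.
import re

LOG = "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out"

watchdog_timeouts = set()
aborted_comms = set()

with open(LOG) as fh:
    for line in fh:
        rank = re.search(r"\[rank(\d+)\]", line)
        if not rank:
            continue
        r = int(rank.group(1))
        if "ProcessGroupNCCL.cpp:1414" in line and "timing out" in line:
            watchdog_timeouts.add(r)  # watchdog caught the SEND timeout on this rank
        elif "torch.distributed.DistBackendError: NCCL communicator was aborted" in line:
            aborted_comms.add(r)  # rank died after its communicator was torn down

print("watchdog timeout ranks:", sorted(watchdog_timeouts))
print("aborted communicator ranks:", sorted(aborted_comms))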
- [... identical tracebacks, interleaved with each other in the original log, are raised by [rank26] and [rank30] ...]
- [... the same traceback is raised by [rank2], by [rank12] and [rank13] (interleaved), and by [rank28]; the trailing lines of the [rank26] and [rank30] tracebacks also land here, out of order ...]
-[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank22]: send_activation() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank22]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank22]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank22]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank22]: dist.send( 
-[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank18]: result = loss.backward() -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default6]:[rank22]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: return user_fn(self, *args) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank18]: send_activation() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank18]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank18]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank18]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank18]: dist.send( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank18]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank14]: send_activation() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank14]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank14]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank14]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank14]: dist.send( 
-[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank14]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank20]: send_activation() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank20]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank20]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank20]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank20]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank20]: dist.send( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank20]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb05bada897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb05cdb3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb05cdb8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb05cdb9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7fb0a8852e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7fb0ad899609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7fb0ad664353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb05bada897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb05cdb3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600075 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600031 milliseconds before timing out.
-[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb05cdb8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb05cdb9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7fb0a8852e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7fb0ad899609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7fb0ad664353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb05bada897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: + 0xe32119 (0x7fb05ca3d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: + 0xd3e95 (0x7fb0a8852e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: + 0x8609 (0x7fb0ad899609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7fb0ad664353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb125d8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffb138b1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffb138b6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffb138b7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ffb5f350e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ffb64397609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ffb64162353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb125d8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffb138b1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffb138b6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffb138b7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ffb5f350e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ffb64397609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ffb64162353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb125d8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ffb1353b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ffb5f350e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7ffb64397609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ffb64162353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dff7f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0e00ad1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0e00ad6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0e00ad7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f0e4c570e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0e515b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f0e51382353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dff7f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0e00ad1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0e00ad6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0e00ad7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f0e4c570e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0e515b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f0e51382353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dff7f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f0e0075b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f0e4c570e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f0e515b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f0e51382353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f55640ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5565384c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5565389a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f556538adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f55b0e23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f55b5e6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f55b5c35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f55640ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5565384c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5565389a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f556538adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f55b0e23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f55b5e6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f55b5c35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f55640ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f556500e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f55b0e23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f55b5e6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f55b5c35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa61dce5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa61efbec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa61efc3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa61efc4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fa66aa5de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fa66faa4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fa66f86f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa61dce5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa61efbec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa61efc3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa61efc4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fa66aa5de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fa66faa4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fa66f86f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa61dce5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fa61ec48119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fa66aa5de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fa66faa4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fa66f86f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97e726e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f97e8547c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f97e854ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f97e854ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9833fe6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f983902d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9838df8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97e726e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f97e8547c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f97e854ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f97e854ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9833fe6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f983902d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9838df8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97e726e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f97e81d1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f9833fe6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f983902d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f9838df8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48dccb5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f48ddf8ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f48ddf93a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f48ddf94dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f4929a2de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f492ea74609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f492e83f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48dccb5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f48ddf8ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f48ddf93a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f48ddf94dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f4929a2de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f492ea74609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f492e83f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48dccb5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f48ddc18119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f4929a2de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f492ea74609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f492e83f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf97323897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdf985fcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdf98601a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdf98602dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fdfe409be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fdfe90e2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fdfe8ead353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf97323897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdf985fcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdf98601a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdf98602dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fdfe409be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fdfe90e2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fdfe8ead353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf97323897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fdf98286119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fdfe409be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fdfe90e2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fdfe8ead353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbf50f75897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbf5224ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbf52253a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbf52254dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fbf9dcede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbfa2d34609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbfa2aff353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbf50f75897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbf5224ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbf52253a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbf52254dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fbf9dcede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbfa2d34609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbfa2aff353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbf50f75897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fbf51ed8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fbf9dcede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fbfa2d34609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fbfa2aff353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc4d0ee4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc4d21bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc4d21c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc4d21c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc51dc5ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc522ca3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc522a6e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc4d0ee4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc4d21bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc4d21c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc4d21c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc51dc5ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc522ca3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc522a6e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc4d0ee4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fc4d1e47119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fc51dc5ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fc522ca3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fc522a6e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41e4fbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f41e6297c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f41e629ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f41e629ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4231d36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4236d7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4236b48353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41e4fbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f41e6297c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f41e629ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f41e629ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4231d36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4236d7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4236b48353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41e4fbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f41e5f21119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f4231d36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f4236d7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f4236b48353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0012087897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0013360c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0013365a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0013366dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f005edffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0063e46609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0063c11353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0012087897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0013360c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0013365a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0013366dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f005edffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0063e46609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0063c11353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0012087897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f0012fea119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f005edffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f0063e46609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f0063c11353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb5e4aa8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb5e5d81c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb5e5d86a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb5e5d87dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fb631820e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fb636867609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb636632353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb5e4aa8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb5e5d81c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb5e5d86a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb5e5d87dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fb631820e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fb636867609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb636632353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]: -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6de5410897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6de66e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb5e4aa8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6de66eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6de66efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #1: + 0xe32119 (0x7fb5e5a0b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fb631820e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fb636867609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fb636632353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #4: + 0xd3e95 (0x7f6e32188e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]: -[default5]:frame #5: + 0x8609 (0x7f6e371cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f6e36f9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6de5410897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6de66e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6de66eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6de66efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f6e32188e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f6e371cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f6e36f9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6de5410897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f6de6373119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f6e32188e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f6e371cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f6e36f9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f29b9269897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f29ba542c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f29ba547a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f29ba548dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2a05fe1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2a0b028609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2a0adf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f29b9269897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f29ba542c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f29ba547a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f29ba548dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2a05fe1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2a0b028609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2a0adf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f29b9269897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f29ba1cc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f2a05fe1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f2a0b028609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f2a0adf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank36]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank36]: grad_accumulator.backward(sum(activations)) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank36]: result = loss.backward() -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank36]: torch.autograd.backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank36]: _engine_run_backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank36]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank36]: return user_fn(self, *args) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank36]: self.grads_buffer.append(recv_grad()) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank37]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank37]: grad_accumulator.backward(sum(activations)) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank37]: result = loss.backward() -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank37]: torch.autograd.backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank37]: _engine_run_backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank37]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank37]: return user_fn(self, *args) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank37]: self.grads_buffer.append(recv_grad()) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank38]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank38]: grad_accumulator.backward(sum(activations)) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank38]: result = loss.backward() -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank38]: torch.autograd.backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank38]: _engine_run_backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank38]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank38]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank38]: return user_fn(self, *args) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank38]: self.grads_buffer.append(recv_grad()) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank32]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank32]: grad_accumulator.backward(sum(activations)) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank32]: result = loss.backward() -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank32]: torch.autograd.backward( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank32]: _engine_run_backward( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank32]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank32]: return user_fn(self, *args) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank32]: self.grads_buffer.append(recv_grad()) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank35]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank35]: grad_accumulator.backward(sum(activations)) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank35]: result = loss.backward() -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank35]: torch.autograd.backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank35]: _engine_run_backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank35]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank35]: return user_fn(self, *args) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank35]: self.grads_buffer.append(recv_grad()) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank33]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank33]: grad_accumulator.backward(sum(activations)) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank33]: result = loss.backward() -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank33]: torch.autograd.backward( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank33]: _engine_run_backward( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank33]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank33]: return user_fn(self, *args) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank33]: self.grads_buffer.append(recv_grad()) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b4eadb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2b4fdb4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2b4fdb9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2b4fdbadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f2b9b853e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f2ba089a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f2ba0665353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b4eadb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2b4fdb4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2b4fdb9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2b4fdbadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f2b9b853e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f2ba089a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f2ba0665353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b4eadb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f2b4fa3e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f2b9b853e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f2ba089a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f2ba0665353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank42]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank42]: grad_accumulator.backward(sum(activations)) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank42]: result = loss.backward() -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank42]: torch.autograd.backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank42]: _engine_run_backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank42]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank42]: return user_fn(self, *args) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank42]: self.grads_buffer.append(recv_grad()) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97c3e4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f97c5124c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f97c5129a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f97c512adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9810bc3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9815c0a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f98159d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97c3e4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f97c5124c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f97c5129a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f97c512adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9810bc3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9815c0a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f98159d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f97c3e4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f97c4dae119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f9810bc3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f9815c0a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f98159d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, 
**kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fce57a53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fce58d2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fce58d31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fce58d32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fcea47cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fcea9812609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fcea95dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fce57a53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fce58d2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fce58d31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fce58d32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fcea47cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fcea9812609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fcea95dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fce57a53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fce589b6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fcea47cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fcea9812609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fcea95dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank46]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank46]: grad_accumulator.backward(sum(activations)) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank46]: result = loss.backward() -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank46]: torch.autograd.backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank46]: _engine_run_backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank46]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank46]: return user_fn(self, *args) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank46]: self.grads_buffer.append(recv_grad()) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a3c1eb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a3d4c4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a3d4c9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a3d4cadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f5a88f63e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f5a8dfaa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5a8dd75353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd6e4099897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd6e5372c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd6e5377a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd6e5378dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7fd730e11e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7fd735e58609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7fd735c23353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
[... the checkTimeout backtrace (frames #0-#6) is then printed a second time, followed by the ncclCommWatchdog backtrace raised from ProcessGroupNCCL.cpp:1418 (frames #0-#4); the interleaved output from [default2] repeats the identical pair of backtraces, differing only in addresses ...]
-[default7]:[rank63]: Traceback (most recent call last):
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank63]: trainer.train(dataloader)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank63]: output = model(**micro_batch)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank63]: sharded_logits = self.model(
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank63]: pipeline_state.run_communication()
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank63]: recv_activation_tensor = recv_activation()
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank63]: dist.recv(
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank63]: return func(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... [default4]:[rank52] fails with the same forward-path traceback (blocked in dist.recv inside _recv_meta while waiting for an activation), ending in: ...]
-[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
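The tracebacks above all stop inside nanotron's P2P layer: `_recv_meta` posts a blocking `dist.recv` for a small metadata message describing the tensor that is about to arrive, and the watchdog reports hung SEND work items with NumelIn=6, presumably such metadata messages. Below is a minimal sketch of that send-shape-then-payload pattern, not nanotron's actual `P2P` class; the helper names and the 6-element metadata layout are made up for illustration.

```python
# Illustrative sketch only. Launch with: torchrun --nproc_per_node=2 this_file.py
# (gloo backend so it runs on CPU; the benchmark itself used NCCL on GPUs).
import torch
import torch.distributed as dist

def send_with_meta(t: torch.Tensor, dst: int) -> None:
    # First send a fixed-size metadata vector (ndim + up to 5 sizes), then the payload.
    meta = torch.zeros(6, dtype=torch.long)
    meta[0] = t.dim()
    meta[1 : 1 + t.dim()] = torch.tensor(t.shape, dtype=torch.long)
    dist.send(meta, dst=dst)
    dist.send(t, dst=dst)

def recv_with_meta(src: int) -> torch.Tensor:
    # Blocks until the peer's metadata SEND is matched; if the peer never reaches its
    # send (e.g. it is stuck elsewhere), the watchdog fires after the process-group timeout.
    meta = torch.zeros(6, dtype=torch.long)
    dist.recv(meta, src=src)
    shape = [int(s) for s in meta[1 : 1 + int(meta[0])]]
    payload = torch.empty(shape)
    dist.recv(payload, src=src)
    return payload

if __name__ == "__main__":
    dist.init_process_group(backend="gloo")
    if dist.get_rank() == 0:
        send_with_meta(torch.randn(2, 3), dst=1)
    else:
        print(recv_with_meta(src=0).shape)
    dist.destroy_process_group()
```

The point of the sketch is that every payload transfer is preceded by a tiny handshake message, so a stalled pipeline stage first shows up as a hung metadata SEND/RECV pair like the ones in the watchdog entries.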
[... [default6]:[rank62] fails with the same forward-path traceback as [rank63] above (blocked in dist.recv inside _recv_meta during recv_activation), ending in: ...]
-[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... [default2]:[rank58] fails with the same forward-path traceback as [rank63] above, ending in: ...]
-[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
[... [default1] ([rank49]) then prints the same pair of checkTimeout / ncclCommWatchdog backtraces and the 'terminate called after throwing an instance of c10::DistBackendError' / what() message as shown for [default6] above, here for [PG 4 Rank 3] and WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152) ...]
-[default7]:[rank39]: Traceback (most recent call last):
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank39]: trainer.train(dataloader)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank39]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank39]: grad_accumulator.backward(sum(activations))
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank39]: result = loss.backward()
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank39]: torch.autograd.backward(
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank39]: _engine_run_backward(
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank39]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank39]: return user_fn(self, *args)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank39]: pipeline_state.run_communication()
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default7]:[rank39]: self.grads_buffer.append(recv_grad())
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank39]: dist.recv(
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank39]: return func(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[... [default5]:[rank45] and [default4]:[rank44] fail with the same backward-path traceback (blocked in dist.recv inside _recv_meta during recv_grad), each ending in: ...]
-[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[... [default2]:[rank34] fails with the same backward-path traceback as [rank39] above, ending in: ...]
-[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out.
[... [default4] then prints the same checkTimeout backtrace and 'terminate called after throwing an instance of c10::DistBackendError' / what() message as shown for [default6] above ...]
[... [default4] repeats the checkTimeout backtrace and the ncclCommWatchdog backtrace (frames #0-#4) ...]
[... [default3]:[rank43] fails with the same backward-path traceback as [rank39] above, ending in: ...]
-[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
[... [default7] ([rank31]) prints the same pair of checkTimeout / ncclCommWatchdog backtraces and the 'terminate called after throwing an instance of c10::DistBackendError' / what() message as shown for [default6] above ...]
[... [default0]:[rank40] fails with the same backward-path traceback as [rank39] above, ending in: ...]
-[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
[... [default5] then prints the same checkTimeout backtrace and 'terminate called after throwing an instance of c10::DistBackendError' / what() message as shown for [default6] above ...]
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f106fc24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1070efdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1070f02a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1070f03dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f10bc99ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f10c19e3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f10c17ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f106fc24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f1070b87119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f10bc99ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f10c19e3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f10c17ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb255f4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb257227c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb25722ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb25722ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb2a2cc6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb2a7d0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb2a7ad8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb255f4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb257227c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb25722ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb25722ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb2a2cc6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb2a7d0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb2a7ad8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb255f4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fb256eb1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb2a2cc6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fb2a7d0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fb2a7ad8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f028951f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f028a7f8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f028a7fda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f028a7fedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f02d6297e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f02db2de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f02db0a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f028951f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f028a7f8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f028a7fda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f028a7fedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f02d6297e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f02db2de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f02db0a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f028951f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f028a482119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f02d6297e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f02db2de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f02db0a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd59e107897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd59f3e0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd59f3e5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd59f3e6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd5eae7fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd5efec6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd5efc91353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd59e107897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd59f3e0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd59f3e5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd59f3e6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd5eae7fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd5efec6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd5efc91353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd59e107897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd59f06a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd5eae7fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fd5efec6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd5efc91353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efed3f1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efed51f6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efed51fba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efed51fcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7eff20c95e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7eff25cdc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7eff25aa7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efed3f1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efed51f6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efed51fba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efed51fcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7eff20c95e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7eff25cdc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7eff25aa7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efed3f1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7efed4e80119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7eff20c95e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7eff25cdc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7eff25aa7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank47]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank47]: grad_accumulator.backward(sum(activations)) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank47]: result = loss.backward() -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank47]: torch.autograd.backward( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank47]: _engine_run_backward( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank47]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank47]: return user_fn(self, *args) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank47]: self.grads_buffer.append(recv_grad()) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = 
self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) 
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() 
-[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank41]: Traceback (most recent call last):
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank41]: trainer.train(dataloader)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default1]:[rank41]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default1]:[rank41]: grad_accumulator.backward(sum(activations))
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default1]:[rank41]: result = loss.backward()
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default1]:[rank41]: torch.autograd.backward(
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default1]:[rank41]: _engine_run_backward(
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default1]:[rank41]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default1]:[rank41]: return user_fn(self, *args)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default1]:[rank41]: pipeline_state.run_communication()
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default1]:[rank41]: self.grads_buffer.append(recv_grad())
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank41]: dist.recv(
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank41]: return func(*args, **kwargs)
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank60]: Traceback (most recent call last):
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank60]: trainer.train(dataloader)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default4]:[rank60]: output = model(**micro_batch)
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank60]: return self._call_impl(*args, **kwargs)
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank60]: return forward_call(*args, **kwargs)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default4]:[rank60]: sharded_logits = self.model(
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank60]: return self._call_impl(*args, **kwargs)
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank60]: return forward_call(*args, **kwargs)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank60]: return self._call_impl(*args, **kwargs)
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank60]: return forward_call(*args, **kwargs)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default4]:[rank60]: pipeline_state.run_communication()
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank60]: recv_activation_tensor = recv_activation()
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank60]: dist.recv(
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank60]: return func(*args, **kwargs)
-[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f12b5e09897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f12b70e2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f12b70e7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f12b70e8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: <unknown function> + 0xd3e95 (0x7f1302b81e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: <unknown function> + 0x8609 (0x7f1307bc8609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7f1307993353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f12b5e09897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f12b70e2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f12b70e7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f12b70e8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: <unknown function> + 0xd3e95 (0x7f1302b81e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: <unknown function> + 0x8609 (0x7f1307bc8609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7f1307993353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f12b5e09897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: <unknown function> + 0xe32119 (0x7f12b6d6c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: <unknown function> + 0xd3e95 (0x7f1302b81e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #3: <unknown function> + 0x8609 (0x7f1307bc8609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #4: clone + 0x43 (0x7f1307993353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600076 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600070 milliseconds before timing out.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600075 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600031 milliseconds before timing out.
-[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef12f79897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef14252c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef14257a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef14258dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fef5fcf1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fef64d38609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fef64b03353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef12f79897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef14252c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef14257a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef14258dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fef5fcf1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fef64d38609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fef64b03353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef12f79897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fef13edc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fef5fcf1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fef64d38609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fef64b03353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb870da5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb87207ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb872083a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb872084dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb8bdb1de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb8c2b64609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb8c292f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb870da5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb87207ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb872083a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb872084dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb8bdb1de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb8c2b64609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb8c292f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb870da5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fb871d08119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fb8bdb1de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fb8c2b64609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fb8c292f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f416360b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f41648e4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f41648e9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f41648eadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f41b0383e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f41b53ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f41b5195353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f416360b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f41648e4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f41648e9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f41648eadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f41b0383e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f41b53ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f41b5195353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f416360b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f416456e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f41b0383e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f41b53ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f41b5195353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48012a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f480257dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4802582a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4802583dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f484e01ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4853063609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4852e2e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48012a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f480257dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4802582a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4802583dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f484e01ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4853063609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4852e2e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48012a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f4802207119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f484e01ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f4853063609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f4852e2e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7fa6e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7fb9bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7fb9c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7fb9c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff84745ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff84c4a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff84c270353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7fa6e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7fb9bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7fb9c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7fb9c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff84745ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff84c4a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff84c270353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7fa6e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ff7fb649119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff84745ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7ff84c4a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ff84c270353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6355372897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f635664bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6356650a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6356651dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f63a20eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f63a7131609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f63a6efc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6355372897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f635664bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6356650a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6356651dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f63a20eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f63a7131609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f63a6efc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6355372897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f63562d5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f63a20eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f63a7131609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f63a6efc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48bee51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f48c012ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f48c012fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f48c0130dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f490bbc9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f4910c10609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f49109db353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48bee51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f48c012ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f48c012fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f48c0130dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f490bbc9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f4910c10609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f49109db353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f48bee51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f48bfdb4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f490bbc9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f4910c10609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f49109db353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c83617897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c848f0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c848f5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c848f6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f3cd038fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f3cd53d6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f3cd51a1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c83617897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c848f0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c848f5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c848f6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f3cd038fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f3cd53d6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f3cd51a1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c83617897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f3c8457a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f3cd038fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f3cd53d6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f3cd51a1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f18dab64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f18dbe3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f18dbe42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f18dbe43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f19278dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f192c923609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f192c6ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f18dab64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f18dbe3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f18dbe42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f18dbe43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f19278dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f192c923609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f192c6ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f18dab64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f18dbac7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f19278dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f192c923609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f192c6ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1aec3b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1aed689c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1aed68ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1aed68fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1b39128e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1b3e16f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1b3df3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1aec3b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1aed689c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1aed68ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1aed68fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1b39128e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1b3e16f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1b3df3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1aec3b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f1aed313119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f1b39128e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f1b3e16f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f1b3df3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3aebe47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3aed120c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3aed125a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3aed126dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3b38bbfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3b3dc06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3b3d9d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3aebe47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3aed120c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3aed125a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3aed126dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3b38bbfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3b3dc06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3b3d9d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3aebe47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3aecdaa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f3b38bbfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f3b3dc06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f3b3d9d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7face9cda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faceafb3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faceafb8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faceafb9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fad36a52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fad3ba99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fad3b864353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7face9cda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faceafb3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faceafb8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faceafb9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fad36a52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fad3ba99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fad3b864353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7face9cda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7faceac3d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fad36a52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fad3ba99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fad3b864353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84f5d7e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84f7057c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84f705ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84f705ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8542af6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8547b3d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8547908353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84f5d7e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84f7057c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84f705ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84f705ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8542af6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8547b3d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8547908353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84f5d7e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f84f6ce1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f8542af6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f8547b3d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f8547908353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efee67c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efee7a9dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efee7aa2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efee7aa3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7eff3353ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7eff38583609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7eff3834e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efee67c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efee7a9dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efee7aa2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efee7aa3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7eff3353ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7eff38583609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7eff3834e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efee67c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7efee7727119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7eff3353ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7eff38583609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7eff3834e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a702d0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a715a9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a715aea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a715afdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f5abd048e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5ac208f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f5ac1e5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a702d0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5a715a9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5a715aea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5a715afdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f5abd048e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5ac208f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f5ac1e5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5a702d0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f5a71233119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f5abd048e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f5ac208f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f5ac1e5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9eecc40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9eedf19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9eedf1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9eedf1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9f399b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9f3e9ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9f3e7ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9eecc40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9eedf19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9eedf1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9eedf1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9f399b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9f3e9ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9f3e7ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9eecc40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f9eedba3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9f399b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f9f3e9ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f9f3e7ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc55a255897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc55b52ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc55b533a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc55b534dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc5a6fcde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc5ac014609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc5abddf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc55a255897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc55b52ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc55b533a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc55b534dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc5a6fcde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc5ac014609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc5abddf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc55a255897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fc55b1b8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fc5a6fcde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fc5ac014609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fc5abddf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc603d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc616afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc616b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc616b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7efcad14ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7efcb2195609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7efcb1f60353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc603d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc616afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc616b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc616b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7efcad14ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7efcb2195609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7efcb1f60353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc603d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7efc61339119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7efcad14ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7efcb2195609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7efcb1f60353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dcd7ff897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0dcead8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0dceadda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0dceadedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f0e1a577e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0e1f5be609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0e1f389353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dcd7ff897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0dcead8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0dceadda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0dceadedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f0e1a577e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0e1f5be609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0e1f389353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dcd7ff897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f0dce762119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f0e1a577e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f0e1f5be609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f0e1f389353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -E0702 23:37:39.999000 139704473855808 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1754111) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1754112) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754112 -[2]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1754113) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754113 -[3]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1754114) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754114 -[4]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1754115) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754115 -[5]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1754116) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754116 -[6]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1754117) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754117 -[7]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1754118) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754118 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:39 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1754111) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1754111 -============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit 
code 1 -W0702 23:37:43.816000 140352219457280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1119276_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:44.173000 139845022799616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3870227_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:44.394000 139662658348800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_846371_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:44.497000 140574592993024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_859579_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:44.519000 139849447196416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1398027_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:44.548000 140415392495360 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-162-233.ec2.internal_1376764_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:44.623000 140576767854336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3742111_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0702 23:37:44.843000 139850683533120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3870301 closing signal SIGTERM -W0702 23:37:44.844000 139850683533120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3870303 closing signal SIGTERM -W0702 23:37:44.844000 139850683533120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3870306 closing signal SIGTERM -W0702 23:37:44.844000 139850683533120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3870308 closing signal SIGTERM -W0702 23:37:44.880000 140582428587840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3742186 closing signal SIGTERM -W0702 23:37:44.880000 140582428587840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3742187 closing signal SIGTERM -W0702 23:37:44.880000 140582428587840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3742189 closing signal SIGTERM -W0702 23:37:44.880000 140582428587840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3742190 closing signal SIGTERM -W0702 23:37:44.880000 140582428587840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3742193 closing signal SIGTERM -W0702 23:37:44.913000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1376837 closing signal SIGTERM -W0702 23:37:44.914000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1376838 closing signal SIGTERM -W0702 23:37:44.914000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1376839 closing signal SIGTERM -W0702 23:37:44.914000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1376840 closing signal SIGTERM -W0702 23:37:44.914000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1376842 closing signal SIGTERM -W0702 23:37:44.914000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1376843 closing signal SIGTERM -W0702 23:37:44.914000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1376844 closing signal SIGTERM -E0702 23:37:45.008000 140580253726528 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 859653) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:37:45.008000 139668319082304 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 846445) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 23:37:45.020000 139668319082304 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_846371_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:45.021000 140580253726528 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_859579_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:45.049000 139668319082304 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_846371_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0702 23:37:45.058000 140580253726528 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_859579_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:45.085000 140580253726528 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_859579_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in -W0702 23:37:45.084000 139668319082304 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_846371_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 859654) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859654 -[2]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 859655) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859655 -[3]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 59 (local_rank: 3) 
- exitcode : -6 (pid: 859656) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859656 -[4]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 859657) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859657 -[5]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 859658) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859658 -[6]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 859659) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859659 -[7]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 859660) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859660 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-88.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 859653) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 859653 -============================================================ - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 846446) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846446 -[2]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 846447) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846447 -[3]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 846448) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846448 -[4]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 846449) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846449 -[5]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 846450) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846450 -[6]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 846451) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846451 -[7]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 846452) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846452 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 846445) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846445 -============================================================ -E0702 23:37:45.094000 140357880190784 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1119350) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 23:37:45.107000 140357880190784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1119276_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0702 23:37:45.110000 139855107929920 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1398101) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 23:37:45.123000 139855107929920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1398027_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:45.136000 140357880190784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1119276_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:45.152000 139855107929920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1398027_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:45.164000 140357880190784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1119276_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 1119351) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119351 -[2]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 1119352) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119352 -[3]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 1119353) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119353 -[4]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - 
exitcode : -6 (pid: 1119354) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119354 -[5]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 1119355) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119355 -[6]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 1119356) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119356 -[7]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 1119357) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119357 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 1119350) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1119350 -============================================================ -W0702 23:37:45.180000 139855107929920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1398027_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 1398102) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398102 -[2]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 1398103) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398103 -[3]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 1398104) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398104 -[4]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 1398105) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398105 -[5]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 
-6 (pid: 1398106) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398106 -[6]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 1398107) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398107 -[7]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 1398108) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398108 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:44 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 1398101) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1398101 -============================================================ -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -E0702 23:37:46.017000 139850683533120 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 3870302) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 23:37:46.029000 139850683533120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3870227_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:46.059000 139850683533120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3870227_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:46.077000 139850683533120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3870227_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-62.ec2.internal - rank : 51 (local_rank: 3) - exitcode : -6 (pid: 3870304) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3870304 -[2]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-62.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 3870305) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3870305 -[3]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-62.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 3870307) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3870307 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-62.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 3870302) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3870302 -============================================================ -E0702 23:37:46.491000 140421053228864 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 4 (pid: 1376841) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 23:37:46.504000 140421053228864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1376764_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:46.536000 140421053228864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1376764_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:46.546000 140421053228864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1376764_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:44 - host : ip-26-0-162-233.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 1376841) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1376841 -============================================================ -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -E0702 23:37:46.888000 140582428587840 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 3742188) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 23:37:46.901000 140582428587840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3742111_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:46.935000 140582428587840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3742111_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 23:37:46.948000 140582428587840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3742111_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-102.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 3742191) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3742191 -[2]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-102.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 3742192) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3742192 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:37:44 - host : ip-26-0-171-102.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 3742188) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3742188 -============================================================ -srun: error: ip-26-0-162-233: task 4: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-4/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/bench.slurm deleted file mode 100644 index ad88e3223dfd4e4c737e8da7fa27a3440309b994..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/config.yaml deleted file mode 100644 index 41b073dc7a4bb5cc15d59502c543acae4a256973..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 16 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 64 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out deleted file mode 100644 index 435a2a897fcc03be566c4b0adae6b36c6a54da77..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/log.out +++ /dev/null @@ -1,1765 +0,0 @@ -======================== -START TIME: Wed Jul 3 02:11:24 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 02:11:27.417000 140025732028224 torch/distributed/run.py:757] -W0703 02:11:27.417000 140025732028224 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.417000 140025732028224 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:11:27.417000 140025732028224 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.417000 140181892396864 torch/distributed/run.py:757] -W0703 02:11:27.417000 140181892396864 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.417000 140181892396864 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:11:27.417000 140181892396864 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.419000 140088508618560 torch/distributed/run.py:757] -W0703 02:11:27.419000 140088508618560 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.419000 140088508618560 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:11:27.419000 140088508618560 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.426000 140471672833856 torch/distributed/run.py:757] -W0703 02:11:27.426000 140471672833856 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.426000 140471672833856 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:11:27.426000 140471672833856 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.428000 140515281438528 torch/distributed/run.py:757] -W0703 02:11:27.428000 140515281438528 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.428000 140515281438528 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 02:11:27.428000 140515281438528 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.430000 140513055242048 torch/distributed/run.py:757] -W0703 02:11:27.430000 140513055242048 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.430000 140513055242048 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:11:27.430000 140513055242048 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.432000 140355073300288 torch/distributed/run.py:757] -W0703 02:11:27.432000 140355073300288 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.432000 140355073300288 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:11:27.432000 140355073300288 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.498000 139778136758080 torch/distributed/run.py:757] -W0703 02:11:27.498000 139778136758080 torch/distributed/run.py:757] ***************************************** -W0703 02:11:27.498000 139778136758080 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:11:27.498000 139778136758080 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 02:11:48 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=4, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:11:48 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=64, -[default0]:07/03/2024 02:11:48 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=16, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64')), -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 02:11:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=12|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=12|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=12|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=11|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=11|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=11|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=13|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=13|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=13|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=8|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=8|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=8|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=7|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=6|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=6|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=7|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=7|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=6|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=8|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=8|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=8|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=12|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=12|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=12|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=13|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=13|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=13|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-220]: No checkpoint path provided. 
-[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=9|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=9|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=2|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=2|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=9|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=7|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=7|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-86]: No checkpoint path provided. 
-[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=6|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=6|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=3|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=3|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=3|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.21G (2315.81MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=9|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=9|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=9|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=11|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=11|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. 
Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=11|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=0|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=0|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=5|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=1|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=5|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=5|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=1|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=2|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=2|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=2|ip-26-0-172-57]: No checkpoint path provided. 
-[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=4|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=4|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=4|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=1|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=1|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=3|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=3|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=4|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=4|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=5|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=5|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. 
-[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=14|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=14|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=14|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=0|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=0|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=0|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default3]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=15|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=15|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=15|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=10|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=10|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=1|TP=10|ip-26-0-163-226]: No checkpoint path provided. 
-[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=10|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=10|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=10|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=15|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=15|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=15|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=14|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=14|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=14|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 02:12:05 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 02:12:05 [INFO|DP=0|PP=3|TP=1|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 02:12:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 02:12:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 02:12:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 02:12:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 02:12:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 02:12:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 02:12:09 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 02:12:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 02:12:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 02:12:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 02:12:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 02:12:11.369396 | mbs: 64 | grad_accum: 16 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 02:12:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 02:12:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default3]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=3|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=8|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=0|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=13|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=10|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=4|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. 
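(Editor's note, a small arithmetic sketch for the "[Start training]" record above: it reports mbs=64, grad_accum=16 and global_batch_size=1024. Assuming a single data-parallel replica, which is consistent with only DP=0 appearing in these logs, the numbers add up, and each optimizer step consumes global_batch_size * sequence_length tokens.)

    # Check the batch-size bookkeeping from the [Start training] log record.
    # dp=1 is an assumption inferred from the log, not an explicit config value.
    dp, mbs, grad_accum, seq_len = 1, 64, 16, 4096

    global_batch_size = dp * mbs * grad_accum
    tokens_per_step = global_batch_size * seq_len

    assert global_batch_size == 1024   # matches the value reported in the log
    print(tokens_per_step)             # 4_194_304 tokens per optimizer step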
-[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=11|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=0|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=1|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=6|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=7|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=8|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=4|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=15|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=9|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=15|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=3|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=9|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=13|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:11 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=12|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=2|ip-26-0-163-220]: Repo card metadata block was not found. 
Setting CardData to empty. -[default2]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=6|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=7|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=14|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=2|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=5|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=12|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 02:12:11 [WARNING|DP=0|PP=1|TP=5|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=11|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 02:12:11 [WARNING|DP=0|PP=2|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:11 [WARNING|DP=0|PP=3|TP=1|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 02:12:12 [WARNING|DP=0|PP=1|TP=14|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 02:12:12 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 02:12:12 [WARNING|DP=0|PP=2|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 02:12:12 [WARNING|DP=0|PP=3|TP=10|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 02:12:12 [WARNING|DP=0|PP=2|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 02:12:12 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank9]: output = model(**micro_batch) -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default3]:[rank11]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: sharded_logits = self.model( -[default2]:[rank10]: Traceback (most recent call last): -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: output = model(**micro_batch) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: output = model(**micro_batch) -[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] 
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: output = model(**micro_batch) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: output = model(**micro_batch) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: sharded_logits = self.model( -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank14]: sharded_logits = self.model( -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: sharded_logits = self.model( -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default1]:[rank9]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: trainer.train(dataloader) -[default4]:[rank12]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] 
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 
764, in forward -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default6]:[rank14]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default4]:[rank12]: output = model(**micro_batch) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank9]: output = self.o_proj(attention_output) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return 
forward_call(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: sharded_logits = self.model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 79.94 MiB is free. Including non-PyTorch memory, this process has 79.24 GiB memory in use. Of the allocated memory 69.33 GiB is allocated by PyTorch, and 61.28 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default2]:[rank10]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default1]:[rank9]: return row_linear( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank12]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 79.94 MiB is free. Including non-PyTorch memory, this process has 79.24 GiB memory in use. Of the allocated memory 69.33 GiB is allocated by PyTorch, and 61.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default1]:[rank9]: out = F.linear(input, weight, bias) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank13]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 39.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.46 GiB is allocated by PyTorch, and 59.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: output = self.o_proj(attention_output) -[default4]:[rank12]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 103.94 MiB is free. Including non-PyTorch memory, this process has 79.22 GiB memory in use. Of the allocated memory 69.33 GiB is allocated by PyTorch, and 125.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return row_linear( -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank12]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default4]:[rank12]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 39.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.46 GiB is allocated by PyTorch, and 59.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default4]:[rank12]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default4]:[rank12]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 79.94 MiB is free. Including non-PyTorch memory, this process has 79.24 GiB memory in use. Of the allocated memory 69.33 GiB is allocated by PyTorch, and 61.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Representative traceback (rank 6); the remaining ranks fail on the same call stack and differ only in the figures summarized below.
-[default6]:[rank6]: Traceback (most recent call last):
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank6]:     trainer.train(dataloader)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank6]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank6]:     outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank6]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank6]:     output = model(**micro_batch)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank6]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank6]:     return forward_call(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank6]:     sharded_logits = self.model(
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank6]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank6]:     return forward_call(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank6]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank6]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank6]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank6]:     return forward_call(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default6]:[rank6]:     output = self.pp_block(**new_kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank6]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank6]:     return forward_call(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default6]:[rank6]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank6]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank6]:     return forward_call(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default6]:[rank6]:     output = self.o_proj(attention_output)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank6]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank6]:     return forward_call(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default6]:[rank6]:     return row_linear(
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default6]:[rank6]:     out = F.linear(input, weight, bias)
-[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 575.94 MiB is free. Including non-PyTorch memory, this process has 78.76 GiB memory in use. Of the allocated memory 69.46 GiB is allocated by PyTorch, and 123.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Ranks 0-7 and rank 15 all fail on this call stack, at the 1024.00 MiB allocation for out = F.linear(input, weight, bias) in row_linear (o_proj). At the point of failure, ranks 1, 3, 5 and 7 report 487.94 MiB free with 78.84 GiB in use; ranks 2 and 4 match rank 6 above (575.94 MiB free, 78.76 GiB in use); rank 15 reports 39.94 MiB free with 79.28 GiB in use and 59.27 MiB reserved but unallocated; rank 0's message is cut off after "Tried to allocate 1024.00 MiB. GPU". In every complete message, 69.46 GiB of the in-use memory is allocated by PyTorch.
-Rank 8 fails slightly earlier in the same attention block, at /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py line 565 (key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()), with torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB.; its message is likewise cut off after "GPU".
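Editor's note: the allocator message above already names the first thing to try, PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. Below is a minimal Python sketch (not part of the benchmark scripts) of that mitigation plus a hypothetical helper, log_allocator_state, for comparing allocated vs. reserved memory; in this setup the variable would more naturally be exported in the launch environment before torchrun starts the workers, since it must be set before the first CUDA allocation.

    # Sketch only: apply the allocator hint from the OOM message and dump the
    # allocator's own accounting to see whether fragmentation (a large gap
    # between reserved and allocated memory) is contributing.
    import os
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")  # before any CUDA allocation

    import torch

    def log_allocator_state(tag: str) -> None:
        # Prints PyTorch's per-device breakdown of allocated vs. reserved memory.
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 2**30
            reserved = torch.cuda.memory_reserved() / 2**30
            print(f"[{tag}] allocated={allocated:.2f} GiB reserved={reserved:.2f} GiB")
            print(torch.cuda.memory_summary(abbreviated=True))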
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank4]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank4]: output = self.o_proj(attention_output) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank4]: return row_linear( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank4]: out = F.linear(input, weight, bias) -[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 575.94 MiB is free. Including non-PyTorch memory, this process has 78.76 GiB memory in use. Of the allocated memory 69.46 GiB is allocated by PyTorch, and 123.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. 
(Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. 
If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior.
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -E0703 02:12:44.911000 140513055242048 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 511973) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 02:12:44.913000 140515281438528 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1125019) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1125020) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1125021) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1125022) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1125023) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1125024) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1125025) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1125026) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:12:44 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1125019) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 511974) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 511975) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 511976) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 511977) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 511978) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 511979) - error_file: - 
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 511980) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 511973) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -W0703 02:12:48.783000 140020071294720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_761797_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:49.549000 140082847885056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1048111_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:49.699000 140349412566784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1822398_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:49.702000 140176231663360 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_890715_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:49.728000 139772476024576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1848651_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:49.747000 140466012100352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3207894_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 02:12:49.797000 140471672833856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3207965 closing signal SIGTERM -W0703 02:12:53.787000 140020071294720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_761797_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 02:12:54.554000 140082847885056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1048111_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:54.704000 140349412566784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1822398_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:54.707000 140176231663360 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_890715_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:54.732000 139772476024576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1848651_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:54.752000 140466012100352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3207894_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:58.791000 140020071294720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_761797_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:58.935000 140181892396864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_890715_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:58.946000 140181892396864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_890715_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 02:12:59.558000 140082847885056 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1048111_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 02:12:59.708000 140349412566784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1822398_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:59.738000 139772476024576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1848651_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:12:59.756000 140466012100352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3207894_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:13:02.744000 140088508618560 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1048111_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:13:02.754000 140088508618560 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1048111_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -W0703 02:13:03.239000 140025732028224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_761797_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:13:03.252000 140025732028224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_761797_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting 
- self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -W0703 02:13:03.645000 140355073300288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1822398_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:13:03.656000 140355073300288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1822398_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 02:13:03.944000 139778136758080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1848651_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:13:03.957000 139778136758080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1848651_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -W0703 02:13:04.139000 140471672833856 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3207894_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 02:13:04.150000 140471672833856 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3207894_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = 
rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-64/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/bench.slurm deleted file mode 100644 index c5f41eb98a0348fa01b7dcf4228ce58a0dfd3dff..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8 llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/config.yaml b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/config.yaml deleted file mode 100644 index 0332848868ac5071a69cc2fdcb3d78ebeb9e589b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 4 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 128 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 8 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out deleted file mode 100644 index 714fddc5e8ab3e42ba2499170de70c74e5fa40e0..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/log.out +++ /dev/null @@ -1,5665 +0,0 @@ -======================== -START TIME: Wed Jul 3 05:52:17 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 05:52:22.009000 140649742108480 torch/distributed/run.py:757] -W0703 05:52:22.009000 140649742108480 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.009000 140649742108480 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:52:22.009000 140649742108480 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.278000 139911714207552 torch/distributed/run.py:757] -W0703 05:52:22.278000 139911714207552 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.278000 139911714207552 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:52:22.278000 139911714207552 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.279000 139733068760896 torch/distributed/run.py:757] -W0703 05:52:22.279000 139733068760896 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.279000 139733068760896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:52:22.279000 139733068760896 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.295000 139952964577088 torch/distributed/run.py:757] -W0703 05:52:22.295000 139952964577088 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.295000 139952964577088 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:52:22.295000 139952964577088 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.412000 140035837380416 torch/distributed/run.py:757] -W0703 05:52:22.412000 140035837380416 torch/distributed/run.py:757] ***************************************** -W0703 05:52:22.412000 140035837380416 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 05:52:22.412000 140035837380416 torch/distributed/run.py:757] *****************************************
-W0703 05:52:22.486000 140571578464064 torch/distributed/run.py:757]
-W0703 05:52:22.486000 140571578464064 torch/distributed/run.py:757] *****************************************
-W0703 05:52:22.486000 140571578464064 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-W0703 05:52:22.486000 140571578464064 torch/distributed/run.py:757] *****************************************
-W0703 05:52:22.933000 139881489946432 torch/distributed/run.py:757]
-W0703 05:52:22.933000 139881489946432 torch/distributed/run.py:757] *****************************************
-W0703 05:52:22.933000 139881489946432 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-W0703 05:52:22.933000 139881489946432 torch/distributed/run.py:757] *****************************************
-W0703 05:52:23.221000 139888602216256 torch/distributed/run.py:757]
-W0703 05:52:23.221000 139888602216256 torch/distributed/run.py:757] *****************************************
-W0703 05:52:23.221000 139888602216256 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-W0703 05:52:23.221000 139888602216256 torch/distributed/run.py:757] *****************************************
-[default0]:07/03/2024 05:52:47 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272)
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config:
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=4,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=8,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=128,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))],
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8')),
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None)
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config:
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272)
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model..
-[default0]:07/03/2024 05:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks...
-[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB)
-[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=3|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB)
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=0|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB)
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=0|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=0|ip-26-0-168-238]: No checkpoint path provided.
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.21G (2315.81MiB)
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB)
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator
-[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB)
-[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB
-[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided.
-[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=5|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=5|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=7|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=7|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=5|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=5|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=5|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=1|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=1|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=1|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-220]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=7|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=7|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=7|ip-26-0-172-57]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=3|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=3|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=3|ip-26-0-172-57]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=8|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=8|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=8|ip-26-0-163-226]: No checkpoint path provided. 
-[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=9|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=9|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=9|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=10|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=10|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=10|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=11|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=11|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=1|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=1|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=11|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=12|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=12|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=12|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=8|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=8|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=8|ip-26-0-172-73]: No checkpoint path provided. 
-[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=10|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=10|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=10|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=14|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=14|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=14|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=13|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=13|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=13|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=15|ip-26-0-163-226]: Local number of parameters: 18.4M (35.05MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=15|ip-26-0-163-226]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=15|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=9|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=9|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=9|ip-26-0-172-73]: No checkpoint path provided. 
-[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-220]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=15|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=15|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=15|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default1]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=11|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=11|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=11|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=13|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=13|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=13|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=12|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=12|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=3|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=12|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=14|ip-26-0-172-73]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=14|ip-26-0-172-73]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=14|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=2|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=2|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. 
Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 24.8M (47.33MiB) -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=6|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=6|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=4|ip-26-0-168-238]: Local number of parameters: 15.8M (30.05MiB) -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=4|ip-26-0-168-238]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 24.8M (47.33MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 55.07MiB. Peak allocated: 57.10MiB Peak reserved: 74.00MiB -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-220]: Local number of parameters: 18.4M (35.05MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-220]: [After model building] Memory usage: 43.07MiB. 
Peak allocated: 45.10MiB Peak reserved: 60.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 05:53:05 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=0|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=0|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default0]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=0|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-86]: Local number of parameters: 15.8M (30.05MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 37.06MiB. Peak allocated: 39.09MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=2|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=2|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=2|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default2]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=2|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=4|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=4|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default4]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=4|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=6|ip-26-0-172-57]: Local number of parameters: 16.9M (32.31MiB) -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=6|ip-26-0-172-57]: [After model building] Memory usage: 36.32MiB. Peak allocated: 38.35MiB Peak reserved: 48.00MiB -[default6]:07/03/2024 05:53:05 [INFO|DP=0|PP=3|TP=6|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 05:53:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 05:53:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 05:53:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 24.8M out of 24.8M (100.00%) params' optimizer states -[default0]:07/03/2024 05:53:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 05:53:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 05:53:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 05:53:09 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 05:53:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 05:53:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 05:53:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 05:53:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 05:53:11.413892 | mbs: 8 | grad_accum: 128 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 05:53:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 05:53:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 244.38MiB. Peak allocated 244.38MiB. Peak reserved: 266.00MiB -[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=9|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=10|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=8|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=0|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=12|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=14|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=1|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=10|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=2|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=3|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=7|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=0|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=1|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=7|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=4|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=3|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=6|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=8|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=11|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=12|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=15|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=13|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=14|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=15|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=13|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=4|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=11|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=5|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=2|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=9|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. 
-[default1]:07/03/2024 05:53:11 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:53:11 [WARNING|DP=0|PP=1|TP=6|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:53:11 [WARNING|DP=0|PP=3|TP=5|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:53:11 [WARNING|DP=0|PP=2|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[... the same UserWarning / run_backward pair is emitted by every local rank (default0-default7) on every node; remaining duplicates omitted ...]
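The UserWarning above comes from PyTorch's autograd fallback for c10d collectives: an in-place c10d::allreduce_ ended up recorded in the autograd graph, and backward later runs through that node. The sketch below is an assumed repro of this class of warning, not code from this run; nanotron presumably reaches the same path through one of its collectives on grad-requiring tensors.

    # Assumed repro, not nanotron code: all-reducing a tensor that is still part of
    # the autograd graph records c10d::allreduce_ as a graph node, and the warning
    # fires when backward() later runs through that node.
    # Run under torchrun so a default process group can be initialized.
    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    x = torch.ones(4, device="cuda", requires_grad=True)
    loss = (x * 2).sum()
    dist.all_reduce(loss)  # in-place collective on a grad-requiring tensor
    loss.backward()        # backprop through c10d::allreduce_ -> UserWarning at graph.py:744
    dist.destroy_process_group()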
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600048 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out.
-[... ranks 16-31 all report the SeqNum=34 [Rank 1] timeout and ranks 0-15 all report the SeqNum=22 [Rank 0] timeout; every pending SEND ran for ~600000 ms before the watchdog aborted it; remaining duplicates omitted ...]
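Each watchdog line above is a SEND that sat in the NCCL queue for the full timeout configured on the process group (Timeout(ms)=600000, i.e. 10 minutes here) before ProcessGroupNCCL aborted the communicator. The sketch below shows where that timeout can be raised when a process group is created; it is illustrative only and is not taken from the nanotron configuration used in this run.

    # Illustrative sketch only (not this run's configuration): the ProcessGroupNCCL
    # watchdog enforces the timeout passed at process-group creation, which is the
    # 600000 ms value seen in the log above.
    from datetime import timedelta

    import torch.distributed as dist

    # Run under torchrun; a larger timeout gives a stuck peer more time before the
    # watchdog aborts the communicator.
    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))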
-[default3]:[rank27]: Traceback (most recent call last):
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank27]: trainer.train(dataloader)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank27]: grad_accumulator.backward(sum(activations))
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank27]: result = loss.backward()
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank27]: torch.autograd.backward(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank27]: _engine_run_backward(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank27]: return user_fn(self, *args)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank27]: pipeline_state.run_communication()
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default3]:[rank27]: send_activation()
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default3]:[rank27]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default3]:[rank27]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default3]:[rank27]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default3]:[rank27]: dist.send(
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank27]: return func(*args, **kwargs)
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default3]:[rank27]: group.send([tensor], group_dst_rank, tag).wait()
-[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[... ranks 21 and 24 fail with the identical traceback and DistBackendError; duplicates omitted ...]
-[... ranks 2, 0, 8, 10, 18, 20, 4, and 6 fail with the same traceback (pipeline_parallel/p2p.py _send_meta -> dist.send) and the same torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0; duplicates omitted ...]
-[default0]:[rank16]: Traceback (most recent call last):
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank16]: trainer.train(dataloader)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank16]: grad_accumulator.backward(sum(activations))
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank16]: result = loss.backward()
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank16]: torch.autograd.backward(
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank16]: _engine_run_backward(
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank16]: return user_fn(self, *args)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank16]: pipeline_state.run_communication()
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default0]:[rank16]: send_activation()
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default0]:[rank16]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default0]:[rank16]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default0]:[rank16]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default0]:[rank16]: dist.send(
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank16]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank3]: send_activation() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank3]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default3]:[rank3]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank3]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank3]: dist.send( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank3]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank9]: send_activation() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank9]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank9]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank9]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank9]: dist.send( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank9]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank13]: send_activation() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank13]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank13]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank13]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank13]: dist.send( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank13]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank15]: send_activation() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank15]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank15]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank15]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank15]: dist.send( 
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank15]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank30]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank30]: send_activation() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank30]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank30]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank30]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank30]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank30]: dist.send( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank30]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: result = loss.backward() -[default1]:[rank17]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank17]: send_activation() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank17]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank19]: send_activation() -[default1]:[rank17]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank19]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank19]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank19]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank17]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank19]: dist.send( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: dist.send( -[default3]:[rank19]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank17]: return func(*args, **kwargs) -[default3]:[rank19]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank17]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank7]: send_activation() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank7]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank7]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank7]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank7]: dist.send( -[default7]:[rank7]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank7]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank12]: send_activation() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank12]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in 
send_tensors -[default4]:[rank12]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank12]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank12]: dist.send( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank12]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank14]: send_activation() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank14]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank14]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank14]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank14]: dist.send( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank14]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward 
-[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank26]: send_activation() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank26]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank26]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank26]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank26]: dist.send( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank26]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank25]: send_activation() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank25]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank25]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank25]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank25]: dist.send( 
-[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank25]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank29]: send_activation() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank29]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank29]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank29]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank29]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank29]: dist.send( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank29]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank28]: send_activation() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank28]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank28]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank28]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank28]: dist.send( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank28]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank31]: send_activation() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank31]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank31]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank31]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank31]: dist.send( 
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank31]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2c3f96897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd2c526fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd2c5274a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd2c5275dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fd310d0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fd315d55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fd315b20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2c3f96897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd2c526fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd2c5274a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd2c5275dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fd310d0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fd315d55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fd315b20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2c3f96897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fd2c4ef9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fd310d0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fd315d55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fd315b20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank22]: send_activation() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank22]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank22]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank22]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank22]: dist.send( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank22]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank23]: send_activation() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank23]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank23]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank23]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank23]: dist.send( 
-[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank23]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank11]: send_activation() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank11]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank11]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank11]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank11]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank11]: dist.send( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank11]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc105a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc1187fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc11884a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc11885dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fdc5d31ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fdc62365609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fdc62130353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc105a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc1187fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc11884a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc11885dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fdc5d31ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fdc62365609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fdc62130353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc105a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fdc11509119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fdc5d31ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fdc62365609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fdc62130353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank1]: send_activation() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank1]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank1]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank1]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank1]: dist.send( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank1]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd04a55897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdd05d2ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdd05d33a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdd05d34dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fdd517cde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fdd56814609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fdd565df353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd04a55897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdd05d2ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdd05d33a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdd05d34dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fdd517cde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fdd56814609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fdd565df353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd04a55897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fdd059b8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fdd517cde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fdd56814609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fdd565df353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff62db1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff62edf6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff62edfba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff62edfcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff67a895e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff67f8dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff67f6a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff62db1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff62edf6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff62edfba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff62edfcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff67a895e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff67f8dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff67f6a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff62db1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7ff62ea80119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7ff67a895e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7ff67f8dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7ff67f6a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8a18841897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8a19b1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8a19b1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8a19b20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f8a655b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f8a6a600609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8a6a3cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8a18841897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8a19b1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8a19b1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8a19b20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f8a655b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f8a6a600609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8a6a3cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8a18841897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f8a197a4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f8a655b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f8a6a600609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f8a6a3cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f83a0897897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f83a1b70c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f83a1b75a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f83a1b76dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f83ed60fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f83f2656609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59e6fdd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59e82b6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #6: clone + 0x43 (0x7f83f2421353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59e82bba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59e82bcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]:frame #4: + 0xd3e95 (0x7f5a33d55e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #5: + 0x8609 (0x7f5a38d9c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #6: clone + 0x43 (0x7f5a38b67353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f83a0897897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd658331897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]: -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd65960ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f83a1b70c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd65960fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f83a1b75a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f83a1b76dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default4]:frame #4: + 0xd3e95 (0x7f83ed60fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd659610dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59e6fdd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #5: + 0x8609 (0x7f83f2656609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59e82b6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd6a50a9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #6: clone + 0x43 (0x7f83f2421353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59e82bba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #5: + 0x8609 (0x7fd6aa0f0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59e82bcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]: -[default3]:frame #4: + 0xd3e95 (0x7f5a33d55e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #5: + 0x8609 (0x7f5a38d9c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f83a0897897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #6: clone + 0x43 (0x7f5a38b67353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #6: clone + 0x43 (0x7fd6a9ebb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:frame #1: + 0xe32119 (0x7f83a17fa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #2: + 0xd3e95 (0x7f83ed60fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]:frame #3: + 0x8609 (0x7f83f2656609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59e6fdd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #4: clone + 0x43 (0x7f83f2421353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #1: + 0xe32119 (0x7f59e7f40119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd658331897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]: -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd65960ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f5a33d55e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd65960fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: + 0x8609 (0x7f5a38d9c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd659610dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd6a50a9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #4: clone + 0x43 (0x7f5a38b67353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:frame #5: + 0x8609 (0x7fd6aa0f0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd6a9ebb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd658331897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fd659294119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fd6a50a9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fd6aa0f0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fd6a9ebb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:[rank5]: trainer.train(dataloader) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2cd0d0b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2cd1fe4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2cd1fe9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2cd1feadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2d1da83e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:frame #5: + 0x8609 (0x7f2d22aca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2d22895353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]: -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2cd0d0b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2cd1fe4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2cd1fe9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2cd1feadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2d1da83e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:[rank5]: send_activation() -[default7]:frame #5: + 0x8609 (0x7f2d22aca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank5]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:frame #6: clone + 0x43 (0x7f2d22895353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank5]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in 
isend_tensors -[default5]:[rank5]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2cd0d0b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f2cd1c6e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank5]: dist.send( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:frame #2: + 0xd3e95 (0x7f2d1da83e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f2d22aca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank5]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:frame #4: clone + 0x43 (0x7f2d22895353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]: -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbc3ac1b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbc3bef4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbc3bef9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbc3befadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fbc87993e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fbc8c9da609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fbc8c7a5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbc3ac1b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbc3bef4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbc3bef9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbc3befadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fbc87993e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fbc8c9da609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fbc8c7a5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbc3ac1b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fbc3bb7e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fbc87993e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fbc8c9da609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fbc8c7a5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ec74cc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f357f1b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3580489c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3ec87a5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3ec87aaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f358048ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f358048fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3ec87abdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f35cbf28e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f35d0f6f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: + 0xd3e95 (0x7f3f14244e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #6: clone + 0x43 (0x7f35d0d3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]:frame #5: + 0x8609 (0x7f3f1928b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #6: clone + 0x43 (0x7f3f19056353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f357f1b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3580489c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ec74cc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3ec87a5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3ec87aaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3ec87abdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f358048ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f3f14244e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f3f1928b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f358048fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f35cbf28e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #6: clone + 0x43 (0x7f3f19056353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #5: + 0x8609 (0x7f35d0f6f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ec74cc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f3ec842f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f3f14244e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f3f1928b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f35d0d3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:frame #4: clone + 0x43 (0x7f3f19056353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f357f1b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f3580113119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f35cbf28e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f35d0f6f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f35d0d3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f129f3d0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f12a06a9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f12a06aea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f12a06afdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f12ec148e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f12f118f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f12f0f5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f66c6b6a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f66c7e43c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f66c7e48a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f66c7e49dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f67138e2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6718929609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f129f3d0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #6: clone + 0x43 (0x7f67186f4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f12a06a9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f12a06aea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f12a06afdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f12ec148e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f12f118f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f12f0f5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f129f3d0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f66c6b6a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f12a0333119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f12ec148e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f66c7e43c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: + 0x8609 (0x7f12f118f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f66c7e48a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: clone + 0x43 (0x7f12f0f5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f66c7e49dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]: -[default6]:frame #4: + 0xd3e95 (0x7f67138e2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6718929609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f67186f4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f66c6b6a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f66c7acd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f67138e2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f6718929609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame 
#4: clone + 0x43 (0x7f67186f4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f03f73a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f03f867bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f03f8680a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f03f8681dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f044411ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f0449161609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f0448f2c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f03f73a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f03f867bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f03f8680a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f03f8681dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f044411ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f0449161609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f0448f2c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f03f73a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f03f8305119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f044411ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f0449161609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f0448f2c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. 
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. 
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb0dbd97897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb0dd070c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb0dd075a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb0dd076dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fb128b0fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fb12db56609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fb12d921353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb0dbd97897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb0dd070c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb0dd075a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb0dd076dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fb128b0fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fb12db56609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fb12d921353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb0dbd97897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fb0dccfa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fb128b0fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fb12db56609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fb12d921353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f45d9a1c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f45dacf5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f45dacfaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f45dacfbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4626794e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f462b7db609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f462b5a6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f45d9a1c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f45dacf5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f45dacfaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f45dacfbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4626794e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f462b7db609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f462b5a6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f45d9a1c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f45da97f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f4626794e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f462b7db609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f462b5a6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 22, last enqueued NCCL work: 23, last completed NCCL work: 21. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d3cca3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d3df7cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d3df81a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d3df82dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9d89a1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9d8ea62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9d8e82d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=22, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d3cca3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d3df7cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d3df81a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d3df82dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9d89a1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9d8ea62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9d8e82d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d3cca3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f9d3dc06119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f9d89a1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f9d8ea62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f9d8e82d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42d9445897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f42da71ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f42da723a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f42da724dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f43261bde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f432b204609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f432afcf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42d9445897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f42da71ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f42da723a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f42da724dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f43261bde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f432b204609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f432afcf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42d9445897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f42da3a8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f43261bde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f432b204609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f432afcf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank46]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank46]: grad_accumulator.backward(sum(activations)) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank46]: result = loss.backward() -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank46]: torch.autograd.backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank46]: _engine_run_backward( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank46]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank46]: return user_fn(self, *args) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank46]: self.grads_buffer.append(recv_grad()) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank41]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank41]: grad_accumulator.backward(sum(activations)) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank41]: result = loss.backward() -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank41]: torch.autograd.backward( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank41]: _engine_run_backward( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank41]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank41]: return user_fn(self, *args) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank41]: self.grads_buffer.append(recv_grad()) -[default1]:[rank41]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank40]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank40]: grad_accumulator.backward(sum(activations)) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank40]: result = loss.backward() -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank40]: torch.autograd.backward( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank40]: _engine_run_backward( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank40]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank40]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank40]: return user_fn(self, *args) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank40]: self.grads_buffer.append(recv_grad()) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank36]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank36]: grad_accumulator.backward(sum(activations)) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank36]: result = loss.backward() -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank36]: torch.autograd.backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank36]: _engine_run_backward( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank36]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank36]: return user_fn(self, *args) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank36]: self.grads_buffer.append(recv_grad()) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank32]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank32]: grad_accumulator.backward(sum(activations)) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank32]: result = loss.backward() -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank32]: torch.autograd.backward( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank32]: _engine_run_backward( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank32]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank32]: return user_fn(self, *args) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank32]: self.grads_buffer.append(recv_grad()) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank37]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank37]: grad_accumulator.backward(sum(activations)) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank37]: result = loss.backward() -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank37]: torch.autograd.backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank37]: _engine_run_backward( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank37]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank37]: return user_fn(self, *args) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank37]: self.grads_buffer.append(recv_grad()) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank42]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank42]: grad_accumulator.backward(sum(activations)) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank42]: result = loss.backward() -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank42]: torch.autograd.backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank42]: _engine_run_backward( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank42]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank42]: return user_fn(self, *args) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank42]: self.grads_buffer.append(recv_grad()) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank38]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank38]: grad_accumulator.backward(sum(activations)) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank38]: result = loss.backward() -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank38]: torch.autograd.backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank38]: _engine_run_backward( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank38]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank38]: return user_fn(self, *args) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank38]: self.grads_buffer.append(recv_grad()) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ff936d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0ffa646c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0ffa64ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0ffa64cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f10460e5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f104b12c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f104aef7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ff936d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0ffa646c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0ffa64ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0ffa64cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f10460e5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f104b12c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f104aef7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ff936d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f0ffa2d0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f10460e5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f104b12c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f104aef7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank45]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank45]: grad_accumulator.backward(sum(activations)) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank45]: result = loss.backward() -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank45]: torch.autograd.backward( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank45]: _engine_run_backward( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank45]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank45]: return user_fn(self, *args) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank45]: self.grads_buffer.append(recv_grad()) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank44]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank44]: grad_accumulator.backward(sum(activations)) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank44]: result = loss.backward() -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank44]: torch.autograd.backward( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank44]: _engine_run_backward( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank44]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank44]: return user_fn(self, *args) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank44]: self.grads_buffer.append(recv_grad()) -[default4]:[rank44]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank54]: Traceback (most recent call last):
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank54]:     trainer.train(dataloader)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank54]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank54]:     outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank54]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank54]:     output = model(**micro_batch)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]:     return forward_call(*args, **kwargs)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank54]:     sharded_logits = self.model(
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]:     return forward_call(*args, **kwargs)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank54]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank54]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]:     return forward_call(*args, **kwargs)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank54]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank54]:     pipeline_state.run_communication()
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank54]:     recv_activation_tensor = recv_activation()
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank54]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank54]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank54]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank54]:     dist.recv(
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank54]:     return func(*args, **kwargs)
-[default6]:[rank54]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank54]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[ranks 48, 51 and 52 fail with the same forward-pass recv traceback as rank 54 above (blocked in dist.recv via recv_from_pipeline_state_buffer), each ending with "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
-[default2]:[rank34]: Traceback (most recent call last):
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank34]:     trainer.train(dataloader)
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank34]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank34]:     outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default2]:[rank34]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default2]:[rank34]:     grad_accumulator.backward(sum(activations))
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default2]:[rank34]:     result = loss.backward()
-[default2]:[rank34]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default2]:[rank34]:     torch.autograd.backward(
-[default2]:[rank34]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default2]:[rank34]:     _engine_run_backward(
-[default2]:[rank34]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default2]:[rank34]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default2]:[rank34]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default2]:[rank34]:     return user_fn(self, *args)
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default2]:[rank34]:     pipeline_state.run_communication()
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default2]:[rank34]:     self.grads_buffer.append(recv_grad())
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default2]:[rank34]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank34]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank34]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank34]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default2]:[rank34]:     dist.recv(
-[default2]:[rank34]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank34]:     return func(*args, **kwargs)
-[default2]:[rank34]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank34]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[ranks 57, 59 and 50 then fail with the same forward-pass recv traceback as rank 54 above (the rank 57 and rank 59 tracebacks are interleaved in the raw log), each ending with "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a531b8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a54491c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a54496a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a54497dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f3a9ff30e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f3aa4f77609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f3aa4d42353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a531b8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a54491c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a54496a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a54497dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f3a9ff30e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f3aa4f77609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f3aa4d42353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a531b8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: + 0xe32119 (0x7f3a5411b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: + 0xd3e95 (0x7f3a9ff30e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: + 0x8609 (0x7f3aa4f77609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7f3aa4d42353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[rank 55 ([default7]) prints the same ProcessGroupNCCL timeout block and C++ stack frames as rank 57 above, for the same WorkNCCL(SeqNum=14, OpType=SEND, ...) operation, which timed out after 600038 milliseconds on that rank]
-[ranks 63, 61 and 49 then fail with the same forward-pass recv traceback as rank 54 above, each ending with "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
-[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank33]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank33]: grad_accumulator.backward(sum(activations)) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank33]: result = loss.backward() -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank33]: torch.autograd.backward( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank33]: _engine_run_backward( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank33]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank33]: return user_fn(self, *args) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank33]: self.grads_buffer.append(recv_grad()) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3928bf9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3929ed2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3929ed7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3929ed8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3975971e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f397a9b8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f397a783353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3928bf9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3929ed2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3929ed7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3929ed8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3975971e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f397a9b8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f397a783353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3928bf9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3929b5c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f3975971e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f397a9b8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f397a783353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c7e00c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c7f2e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c7f2eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c7f2ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f7ccad84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f7ccfdcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f7ccfb96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c7e00c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c7f2e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c7f2eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c7f2ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f7ccad84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f7ccfdcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f7ccfb96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c7e00c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f7c7ef6f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f7ccad84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f7ccfdcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f7ccfb96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2791c24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2792efdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2792f02a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2792f03dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f27de99ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f27e39e3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f27e37ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2791c24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2792efdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2792f02a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2792f03dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f27de99ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f27e39e3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f27e37ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2791c24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f2792b87119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f27de99ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f27e39e3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f27e37ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7292281897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f729355ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f729355fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7293560dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f72deff9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f72e4040609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f72e3e0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7292281897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f729355ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f729355fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7293560dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f72deff9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f72e4040609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f72e3e0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7292281897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f72931e4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f72deff9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f72e4040609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f72e3e0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f697798b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6978c64c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6978c69a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6978c6adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f69c4703e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f69c974a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f69c9515353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f697798b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6978c64c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6978c69a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6978c6adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f69c4703e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f69c974a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f69c9515353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f697798b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f69788ee119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f69c4703e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f69c974a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f69c9515353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank35]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank35]: grad_accumulator.backward(sum(activations)) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank35]: result = loss.backward() -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default3]:[rank35]: torch.autograd.backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank35]: _engine_run_backward( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank35]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank35]: return user_fn(self, *args) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank35]: self.grads_buffer.append(recv_grad()) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank39]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank39]: grad_accumulator.backward(sum(activations)) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank39]: result = loss.backward() -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank39]: torch.autograd.backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank39]: _engine_run_backward( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank39]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank39]: return user_fn(self, *args) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank39]: self.grads_buffer.append(recv_grad()) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) 
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa05e8d1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa05fbaac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa05fbafa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa05fbb0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa0ab649e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa0b0690609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa0b045b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa05e8d1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa05fbaac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa05fbafa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa05fbb0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa0ab649e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa0b0690609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa0b045b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa05e8d1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fa05f834119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fa0ab649e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fa0b0690609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fa0b045b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank43]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank43]: grad_accumulator.backward(sum(activations)) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank43]: result = loss.backward() -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank43]: torch.autograd.backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank43]: _engine_run_backward( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank43]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank43]: return user_fn(self, *args) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank43]: self.grads_buffer.append(recv_grad()) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank47]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank47]: grad_accumulator.backward(sum(activations)) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank47]: result = loss.backward() -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank47]: torch.autograd.backward( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank47]: _engine_run_backward( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank47]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank47]: return user_fn(self, *args) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank47]: self.grads_buffer.append(recv_grad()) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35d4f30897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f35d6209c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f35d620ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f35d620fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f3621ca8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f3626cef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f3626aba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35d4f30897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f35d6209c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f35d620ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f35d620fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f3621ca8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f3626cef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f3626aba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35d4f30897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f35d5e93119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f3621ca8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f3626cef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f3626aba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbc3cde4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbc3e0bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbc3e0c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbc3e0c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fbc89b5ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbc8eba3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbc8e96e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbc3cde4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbc3e0bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbc3e0c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbc3e0c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fbc89b5ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbc8eba3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbc8e96e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbc3cde4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fbc3dd47119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fbc89b5ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fbc8eba3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fbc8e96e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2f5b94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd2f6e6dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd2f6e72a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd2f6e73dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd34290ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd347953609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd34771e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2f5b94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd2f6e6dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd2f6e72a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd2f6e73dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd34290ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd347953609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd34771e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd2f5b94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fd2f6af7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd34290ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fd347953609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd34771e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7febe0880897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7febe1b59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7febe1b5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7febe1b5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fec2d5f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fec3263f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fec3240a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7febe0880897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7febe1b59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7febe1b5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7febe1b5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fec2d5f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fec3263f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fec3240a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7febe0880897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7febe17e3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fec2d5f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fec3263f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fec3240a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c1ffd0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c212a9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c212aea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c212afdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c6cd48e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3c71d8f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3c71b5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c1ffd0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c212a9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c212aea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c212afdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c6cd48e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3c71d8f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3c71b5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c1ffd0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3c20f33119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f3c6cd48e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f3c71d8f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f3c71b5a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd76ef7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd770255c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd77025aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd77025bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd7bbcf4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd7c0d3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd7c0b06353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd76ef7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd770255c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd77025aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd77025bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd7bbcf4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd7c0d3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd7c0b06353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd76ef7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd76fedf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd7bbcf4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fd7c0d3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd7c0b06353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f009a754897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f009ba2dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f009ba32a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f009ba33dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ea3ea3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4ea517cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4ea5181a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4ea5182dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f00e74cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #4: + 0xd3e95 (0x7f4ef0c1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f00ec513609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #5: + 0x8609 (0x7f4ef5c62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default1]:frame #6: clone + 0x43 (0x7f00ec2de353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default2]:frame #6: clone + 0x43 (0x7f4ef5a2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1702f83897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f170425cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ea3ea3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f009a754897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1704261a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4ea517cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1704262dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f009ba2dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f174fcfbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4ea5181a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4ea5182dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4ef0c1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4ef5c62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f009ba32a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #5: + 0x8609 (0x7f1754d42609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f1754b0d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f009ba33dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f00e74cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]: -[default2]:frame #6: clone + 0x43 (0x7f4ef5a2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, 
OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default2]: -[default1]:frame #5: + 0x8609 (0x7f00ec513609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f00ec2de353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1702f83897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f009a754897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f009b6b7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f00e74cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f00ec513609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ea3ea3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f170425cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1704261a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: clone + 0x43 (0x7f00ec2de353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1704262dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f174fcfbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #1: + 0xe32119 (0x7f4ea4e06119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f4ef0c1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f4ef5c62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f4ef5a2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #5: + 0x8609 (0x7f1754d42609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f1754b0d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1702f83897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]: -[default4]:frame #1: + 0xe32119 (0x7f1703ee6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f174fcfbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f1754d42609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f1754b0d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc11a460897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc11b739c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc11b73ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc11b73fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc1671d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc16c21f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc16bfea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc11a460897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc11b739c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc11b73ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc11b73fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc1671d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc16c21f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc16bfea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc11a460897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fc11b3c3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fc1671d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fc16c21f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fc16bfea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 34, last enqueued NCCL work: 35, last completed NCCL work: 33. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1ec67a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1ed953c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f6e577897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1ed958a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1ed959dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fb2393f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fb23e439609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fb23e204353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f6f850c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f6f855a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f6f856dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1ec67a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1ed953c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5fbb2efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1ed958a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1ed959dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #5: + 0x8609 (0x7f5fc0336609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: + 0xd3e95 (0x7fb2393f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #6: clone + 0x43 (0x7f5fc0101353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:frame #5: + 0x8609 (0x7fb23e439609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fb23e204353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=34, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]: -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f6e577897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f6f850c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f6f855a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f6f856dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5fbb2efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1ec67a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fb1ed5dd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #5: + 0x8609 (0x7f5fc0336609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5fc0101353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:frame #2: + 0xd3e95 (0x7fb2393f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]: -[default7]:frame #3: + 0x8609 (0x7fb23e439609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #4: clone + 0x43 (0x7fb23e204353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f6e577897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f5f6f4da119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f5fbb2efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]: -[default5]:frame #3: + 0x8609 (0x7f5fc0336609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f5fc0101353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = 
self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2bf929e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2bfa577c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2bfa57ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2bfa57ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2c46016e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f2c4b05d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2c4ae28353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2bf929e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2bfa577c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2bfa57ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2bfa57ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2c46016e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f2c4b05d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2c4ae28353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2bf929e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f2bfa201119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f2c46016e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f2c4b05d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f2c4ae28353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02076d8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02089b1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02089b6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02089b7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f0254450e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0259497609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0259262353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02076d8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02089b1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02089b6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02089b7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f0254450e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0259497609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0259262353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02076d8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f020863b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f0254450e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f0259497609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f0259262353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f57270a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f572837dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5728382a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5728383dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5773e1ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5778e63609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f5778c2e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f57270a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f572837dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5728382a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5728383dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5773e1ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5778e63609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f5778c2e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f57270a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f5728007119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f5773e1ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f5778e63609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f5778c2e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f00e40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4f02119c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4f0211ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4f0211fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f4f4dbb8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f4f52bff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4f529ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f00e40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4f02119c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4f0211ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4f0211fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f4f4dbb8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f4f52bff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4f529ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f00e40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f4f01da3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f4f4dbb8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f4f52bff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f4f529ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6ce285897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6cf55ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6cf563a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6cf564dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa71affde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa720044609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa71fe0f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6ce285897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6cf55ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6cf563a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6cf564dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa71affde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa720044609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa71fe0f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6ce285897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fa6cf1e8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fa71affde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fa720044609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fa71fe0f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f33b58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f34e31c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f34e36a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f34e37dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f2f808d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2f85917609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2f856e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f33b58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f34e31c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f34e36a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f34e37dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f2f808d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2f85917609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2f856e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f33b58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f2f34abb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f2f808d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f2f85917609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f2f856e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc362fd9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc3642b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc3642b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc3642b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc3afd51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc3b4d98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc3b4b63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc362fd9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc3642b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc3642b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc3642b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc3afd51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc3b4d98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc3b4b63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc362fd9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fc363f3c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fc3afd51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fc3b4d98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fc3b4b63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 3] Timeout at NCCL work: 14, last enqueued NCCL work: 15, last completed NCCL work: 13. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feadbb6c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feadce45c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feadce4aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feadce4bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7feb288e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7feb2d92b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7feb2d6f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=14, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feadbb6c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feadce45c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feadce4aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feadce4bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7feb288e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7feb2d92b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7feb2d6f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feadbb6c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7feadcacf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7feb288e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7feb2d92b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7feb2d6f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default4]:[… backtrace omitted; same frames as the default2 trace above …]
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default4]:[… backtrace omitted; same frames as the default2 trace above …]
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
-[default0]:[… backtrace omitted; same frames as the default2 trace above …]
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
-[default0]:[… backtrace omitted; same frames as the default2 trace above …]
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default6]:[… backtrace omitted; same frames as the default2 trace above …]
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default6]:[… backtrace omitted; same frames as the default2 trace above …]
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default5]:[… backtrace omitted; same frames as the default2 trace above …]
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default5]:[… backtrace omitted; same frames as the default2 trace above …]
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default2]:[… backtrace omitted; same frames as the default2 trace above …]
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default2]:[… backtrace omitted; same frames as the default2 trace above …]
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600053 milliseconds before timing out.
-[default1]:[… backtrace omitted; same frames as the default2 trace above …]
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600053 milliseconds before timing out.
-[default1]:[… backtrace omitted; same frames as the default2 trace above …]
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default0]:[… backtrace omitted; same frames as the default2 trace above …]
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default0]:[… backtrace omitted; same frames as the default2 trace above …]
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600052 milliseconds before timing out.
-[default6]:[… backtrace omitted; same frames as the default2 trace above …]
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600052 milliseconds before timing out.
-[default6]:[… backtrace omitted; same frames as the default2 trace above …]
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default5]:[… backtrace omitted; same frames as the default2 trace above …]
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default5]:[… backtrace omitted; same frames as the default2 trace above …]
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600074 milliseconds before timing out.
-[default2]:[… backtrace omitted; same frames as the default2 trace above …]
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600074 milliseconds before timing out.
-[default2]:[… backtrace omitted; same frames as the default2 trace above …]
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600067 milliseconds before timing out.
-[default4]:[… backtrace omitted; same frames as the default2 trace above …]
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600067 milliseconds before timing out.
-[default4]:[… backtrace omitted; same frames as the default2 trace above …]
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:[… backtrace omitted; same frames as the default2 trace above …]
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:[… backtrace omitted; same frames as the default2 trace above …]
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
-[default3]:[… backtrace omitted; same frames as the default2 trace above …]
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
-[default3]:[… backtrace omitted; same frames as the default2 trace above …]
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default3]:[… backtrace omitted; same frames as the default2 trace above …]
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default3]:[… backtrace omitted; same frames as the default2 trace above …]
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600093 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c5ed45897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c6001ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c60023a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c60024dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7cababde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7cb0b04609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7cb08cf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c5ed45897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c6001ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c60023a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c60024dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7cababde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7cb0b04609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7cb08cf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c5ed45897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f7c5fca8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f7cababde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f7cb0b04609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f7cb08cf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 2] Timeout at NCCL work: 26, last enqueued NCCL work: 27, last completed NCCL work: 25. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f53b9b60897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f53bae39c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f53bae3ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f53bae3fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f54068d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f540b91f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f540b6ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=26, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f53b9b60897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f53bae39c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f53bae3ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f53bae3fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f54068d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f540b91f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f540b6ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f53b9b60897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f53baac3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f54068d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f540b91f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f540b6ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -W0703 06:03:40.100000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3244602 closing signal SIGTERM -W0703 06:03:40.100000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3244603 closing signal SIGTERM -W0703 06:03:40.100000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3244604 closing signal SIGTERM -W0703 06:03:40.100000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3244606 closing signal SIGTERM -W0703 06:03:40.100000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3244607 closing signal SIGTERM -W0703 06:03:40.100000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3244608 closing signal SIGTERM -W0703 06:03:40.100000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3244609 closing signal SIGTERM -W0703 06:03:40.111000 139881489946432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 928503 closing signal SIGTERM -W0703 06:03:40.111000 139881489946432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 928505 closing signal SIGTERM -W0703 06:03:40.111000 139881489946432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 928507 closing signal SIGTERM -W0703 06:03:40.111000 139881489946432 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 928508 closing signal SIGTERM -W0703 06:03:40.112000 139881489946432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 928509 closing signal SIGTERM -W0703 06:03:40.121000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 549458 closing signal SIGTERM -W0703 06:03:40.122000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 549459 closing signal SIGTERM -W0703 06:03:40.122000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 549461 closing signal SIGTERM -W0703 06:03:40.122000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 549462 closing signal SIGTERM -W0703 06:03:40.122000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 549463 closing signal SIGTERM -W0703 06:03:40.122000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 549464 closing signal SIGTERM -W0703 06:03:40.122000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 549465 closing signal SIGTERM -W0703 06:03:40.140000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 798569 closing signal SIGTERM -W0703 06:03:40.141000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 798570 closing signal SIGTERM -W0703 06:03:40.141000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 798571 closing signal SIGTERM -W0703 06:03:40.141000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 798572 closing signal SIGTERM -W0703 06:03:40.141000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 798573 closing signal SIGTERM -W0703 06:03:40.141000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 798575 closing signal SIGTERM -W0703 06:03:40.141000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 798576 closing signal SIGTERM -E0703 06:03:40.338000 140035837380416 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1164132) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
-============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1164133) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164133 -[2]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1164134) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164134 -[3]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1164135) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164135 -[4]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1164136) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164136 -[5]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1164137) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164137 -[6]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1164138) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164138 -[7]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1164139) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164139 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:03:40 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1164132) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1164132 -============================================================ -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -E0703 06:03:41.418000 139881489946432 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 928504) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:03:41.536000 139881489946432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_928430_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:41.565000 139881489946432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_928430_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:41.579000 139881489946432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_928430_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:03:40 - host : ip-26-0-172-73.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 928506) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 928506 -[2]: - time : 2024-07-03_06:03:40 - host : ip-26-0-172-73.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 928510) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 928510 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:03:40 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 928504) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 928504 -============================================================ -E0703 06:03:41.819000 139952964577088 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 5 (pid: 798574) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:03:41.834000 139952964577088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_798496_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:41.863000 139952964577088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_798496_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:41.871000 139952964577088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_798496_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:03:40 - host : ip-26-0-163-220.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 798574) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 798574 -============================================================ -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -E0703 06:03:42.416000 139911714207552 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 549460) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 06:03:42.420000 139733068760896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 3 (pid: 3244605) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:03:42.433000 139911714207552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_549386_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:42.434000 139733068760896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3244529_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:42.461000 139911714207552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_549386_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:42.469000 139911714207552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_549386_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:42.468000 139733068760896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3244529_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:03:40 - host : ip-26-0-161-178.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 549460) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 549460 -============================================================ -W0703 06:03:42.476000 139733068760896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3244529_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:03:40 - host : ip-26-0-163-226.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 3244605) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3244605 -============================================================ -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -W0703 06:03:44.196000 140565917730560 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1887177_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:44.681000 139882941482752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1087071_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:44.780000 140644081374976 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1860224_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-E0703 06:03:45.238000 140571578464064 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1887251) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 06:03:45.241000 140649742108480 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1860297) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 06:03:45.254000 139888602216256 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1087144) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:03:45.252000 140571578464064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1887177_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:45.253000 140649742108480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1860224_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:45.266000 139888602216256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1087071_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:45.282000 140571578464064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1887177_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:45.282000 140649742108480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1860224_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:45.298000 139888602216256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1087071_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:03:45.313000 140571578464064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1887177_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 33 (local_rank: 1) - exitcode : -6 (pid: 1887252) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887252 -[2]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 1887253) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887253 -[3]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 35 (local_rank: 3) - exitcode : -6 (pid: 1887254) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887254 -[4]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 1887255) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887255 -[5]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 1887256) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887256 -[6]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 1887257) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887257 -[7]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 39 (local_rank: 7) - exitcode : -6 (pid: 1887258) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887258 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:03:45 - host : ip-26-0-168-238.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 1887251) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1887251 -============================================================ -W0703 06:03:45.320000 140649742108480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1860224_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -W0703 06:03:45.327000 139888602216256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1087071_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 1860298) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860298 -[2]: - time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 1860299) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860299 -[3]: - time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 1860300) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860300 -[4]: - time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 1860301) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860301 -[5]: - time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 1860302) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860302 -[6]: - time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 1860303) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860303 -[7]: - time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 1860304) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860304 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - 
time : 2024-07-03_06:03:45 - host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 1860297) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1860297 -============================================================ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 1087145) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087145 -[2]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : -6 (pid: 1087146) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087146 -[3]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : -6 (pid: 1087147) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087147 -[4]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 1087148) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087148 -[5]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : -6 (pid: 1087149) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087149 -[6]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 1087150) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087150 -[7]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : -6 (pid: 1087151) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087151 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:03:45 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 1087144) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1087144 -============================================================ -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
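The failure above follows exactly the pattern that the bench.slurm scripts in this diff grep for: the NCCL watchdog aborts every rank with SIGABRT after a pipeline-parallel SEND exceeds the 600000 ms limit shown in the watchdog message, srun then exits non-zero, and the script classifies the run from log.out. A minimal Python sketch of that classification, covering the failure branch only (a hypothetical helper, not part of the repository; the file path is illustrative):

# Hypothetical re-implementation of the grep/printf classification in bench.slurm.
# The markers and labels mirror the script's branches; "log.out" is an assumed path.
def classify_run(log_path: str) -> str:
    with open(log_path, errors="replace") as f:
        log = f.read()
    if "OutOfMemoryError" in log or "CUDA error: an illegal memory access" in log:
        return "oom"      # the script writes "oom" for both of these markers
    if "Timeout at NCCL" in log:
        return "timeout"  # matches the watchdog timeout messages seen above
    return "fail"

print(classify_run("log.out"))

Applied to the log above this yields "timeout", which is consistent with the status.txt removed in the next hunk.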
diff --git a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt b/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-8/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/bench.slurm deleted file mode 100644 index f201b25e6fed34b4fa220d4d5e88edb9b3bc66d6..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/config.yaml deleted file mode 100644 index 98be0edf3d694e640de493b98bbd8533be2e7db5..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1024 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out deleted file mode 100644 index 94e3e64e5602c2a9e3ca8983e0dffbd57dbfacab..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/log.out +++ /dev/null @@ -1,5750 +0,0 @@ -======================== -START TIME: Wed Jul 3 07:03:12 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 07:03:18.043000 139791967815488 torch/distributed/run.py:757] -W0703 07:03:18.043000 139791967815488 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.043000 139791967815488 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:03:18.043000 139791967815488 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.102000 140232542312256 torch/distributed/run.py:757] -W0703 07:03:18.102000 140232542312256 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.102000 140232542312256 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:03:18.102000 140232542312256 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.805000 140544871081792 torch/distributed/run.py:757] -W0703 07:03:18.805000 140544871081792 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.805000 140544871081792 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:03:18.805000 140544871081792 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.819000 139651878709056 torch/distributed/run.py:757] -W0703 07:03:18.819000 139651878709056 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.819000 139651878709056 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:03:18.819000 139651878709056 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.837000 139792313845568 torch/distributed/run.py:757] -W0703 07:03:18.837000 139792313845568 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.837000 139792313845568 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 07:03:18.837000 139792313845568 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.906000 140574532601664 torch/distributed/run.py:757] -W0703 07:03:18.906000 140574532601664 torch/distributed/run.py:757] ***************************************** -W0703 07:03:18.906000 140574532601664 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:03:18.906000 140574532601664 torch/distributed/run.py:757] ***************************************** -W0703 07:03:19.347000 139999424665408 torch/distributed/run.py:757] -W0703 07:03:19.347000 139999424665408 torch/distributed/run.py:757] ***************************************** -W0703 07:03:19.347000 139999424665408 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:03:19.347000 139999424665408 torch/distributed/run.py:757] ***************************************** -W0703 07:03:19.349000 140368660313920 torch/distributed/run.py:757] -W0703 07:03:19.349000 140368660313920 torch/distributed/run.py:757] ***************************************** -W0703 07:03:19.349000 140368660313920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:03:19.349000 140368660313920 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 07:03:44 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=2, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 07:03:44 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=1024,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train',
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))],
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1')),
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None)
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config:
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1,
-[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 07:03:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: No checkpoint path provided. 
-[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: No checkpoint path provided. -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 07:04:03 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 07:04:03 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 07:04:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 07:04:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 07:04:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 07:04:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 07:04:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 07:04:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 07:04:06 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 07:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 07:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 07:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 07:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 07:04:08.565269 | mbs: 1 | grad_accum: 1024 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 07:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 07:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=30|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=24|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=19|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=16|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=22|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=31|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=27|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=26|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=28|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=30|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=29|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=26|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=31|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=27|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=24|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=16|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=22|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=18|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=17|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=23|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=17|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=20|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=21|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=18|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=23|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=25|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=29|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=28|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=21|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=25|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=19|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=20|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. 
Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:04:08 [WARNING|DP=0|PP=1|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:04:13 [WARNING|DP=0|PP=1|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600000 milliseconds before timing out.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600006 milliseconds before timing out.
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out.
-[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600021 milliseconds before timing out.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600019 milliseconds before timing out.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600013 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600008 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600008 milliseconds before timing out.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600034 milliseconds before timing out.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600031 milliseconds before timing out.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600023 milliseconds before timing out.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600035 milliseconds before timing out.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600000 milliseconds before timing out.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600038 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600026 milliseconds before timing out.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600034 milliseconds before timing out.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600008 milliseconds before timing out.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600016 milliseconds before timing out.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600055 milliseconds before timing out.
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600048 milliseconds before timing out.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600057 milliseconds before timing out.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600069 milliseconds before timing out.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600079 milliseconds before timing out.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600087 milliseconds before timing out.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600079 milliseconds before timing out.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600087 milliseconds before timing out.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default5]:[rank37]: Traceback (most recent call last):
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default5]:[rank37]: trainer.train(dataloader)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank37]: output = model(**micro_batch)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank37]: return self._call_impl(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank37]: return forward_call(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank37]: sharded_logits = self.model(
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank37]: return self._call_impl(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank37]: return forward_call(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank37]: return self._call_impl(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank37]: return forward_call(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank37]: pipeline_state.run_communication()
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank37]: recv_activation_tensor = recv_activation()
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank37]: dist.recv(
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank37]: return func(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank22]: self.grads_buffer.append(recv_grad()) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank22]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank22]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default6]:[rank22]: dist.recv( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank22]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank18]: result = loss.backward() -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: return user_fn(self, *args) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank18]: self.grads_buffer.append(recv_grad()) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank18]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank18]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank18]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank18]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank18]: dist.recv( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank18]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank27]: result = loss.backward() -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank27]: _engine_run_backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank27]: return user_fn(self, *args) -[default3]:[rank27]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank27]: self.grads_buffer.append(recv_grad()) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank27]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank27]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank27]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank27]: dist.recv( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank27]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank28]: self.grads_buffer.append(recv_grad()) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank28]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank28]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank28]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default4]:[rank28]: dist.recv( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank28]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank24]: self.grads_buffer.append(recv_grad()) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank24]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank24]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank24]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank24]: dist.recv( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank24]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank5]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default5]:[rank5]: dist.recv( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank6]: self.grads_buffer.append(recv_grad()) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank6]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank6]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank6]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank6]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank6]: dist.recv( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank6]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 
40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank4]: self.grads_buffer.append(recv_grad()) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank4]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank4]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank4]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank4]: dist.recv( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank4]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank20]: self.grads_buffer.append(recv_grad()) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank20]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank20]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank20]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank20]: dist.recv( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank20]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank23]: self.grads_buffer.append(recv_grad()) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank23]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank23]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank23]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank23]: dist.recv( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank23]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self,
*args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank1]: self.grads_buffer.append(recv_grad()) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank1]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank1]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank1]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank1]: dist.recv( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank1]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank19]: self.grads_buffer.append(recv_grad()) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank19]: dist.recv( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default0]:[rank16]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank16]: self.grads_buffer.append(recv_grad()) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank16]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank16]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank16]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank16]: dist.recv( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank16]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank25]: self.grads_buffer.append(recv_grad()) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank25]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank25]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank25]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default1]:[rank25]: dist.recv( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank25]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank11]: self.grads_buffer.append(recv_grad()) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank11]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank11]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank11]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank11]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank11]: dist.recv( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank11]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank9]: self.grads_buffer.append(recv_grad()) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank9]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank9]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank9]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank9]: dist.recv( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank9]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank52]: pipeline_state.run_communication() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank52]: dist.recv( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: return func(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank3]: self.grads_buffer.append(recv_grad()) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank3]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank3]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank3]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default3]:[rank3]: dist.recv( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank3]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank12]: Traceback (most recent call last):
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank12]: trainer.train(dataloader)
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default4]:[rank12]: grad_accumulator.backward(sum(activations))
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default4]:[rank12]: result = loss.backward()
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default4]:[rank12]: torch.autograd.backward(
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default4]:[rank12]: _engine_run_backward(
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default4]:[rank12]: return user_fn(self, *args)
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default4]:[rank12]: pipeline_state.run_communication()
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default4]:[rank12]: self.grads_buffer.append(recv_grad())
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default4]:[rank12]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank12]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank12]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank12]: dist.recv(
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank12]: return func(*args, **kwargs)
-[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank12]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600031 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e448e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7e45bc0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7e45bc5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7e45bc6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: <unknown function> + 0xd3e95 (0x7f7e9165fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: <unknown function> + 0x8609 (0x7f7e966a6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7f7e96471353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600031 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e448e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7e45bc0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7e45bc5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7e45bc6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: <unknown function> + 0xd3e95 (0x7f7e9165fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: <unknown function> + 0x8609 (0x7f7e966a6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7f7e96471353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e448e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: <unknown function> + 0xe32119 (0x7f7e4584a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: <unknown function> + 0xd3e95 (0x7f7e9165fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #3: <unknown function> + 0x8609 (0x7f7e966a6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7f7e96471353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe277bc5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe278e9ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe278ea3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe278ea4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fe2c493de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fe2c9984609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fe2c974f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe277bc5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe278e9ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe278ea3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe278ea4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fe2c493de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fe2c9984609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fe2c974f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe277bc5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fe278b28119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fe2c493de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fe2c9984609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fe2c974f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
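The watchdog lines above all report the same signature: a pipeline-parallel SEND (SeqNum=15) hit the 600000 ms NCCL collective timeout before its peer completed the matching operation. As a point of reference only, the sketch below shows how that timeout could be raised when the process group is created; the function name and the 30-minute value are illustrative and are not part of the benchmark scripts, which appear to use the default.

    # Sketch only: raising the collective timeout at process-group init.
    # Assumes the usual torchrun-provided env vars (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT).
    from datetime import timedelta

    import torch.distributed as dist

    def init_with_longer_timeout() -> None:
        # The default timeout for the NCCL backend is 10 minutes, which matches the
        # Timeout(ms)=600000 figure reported by the watchdog messages above.
        dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))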
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b17457897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2b18730c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2b18735a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2b18736dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2b641cfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2b69216609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2b68fe1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b17457897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2b18730c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2b18735a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2b18736dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2b641cfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2b69216609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2b68fe1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b17457897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f2b183ba119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f2b641cfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f2b69216609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f2b68fe1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbeb349897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efbec622c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efbec627a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efbec628dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7efc380c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7efc3d108609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7efc3ced3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbeb349897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efbec622c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efbec627a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efbec628dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7efc380c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7efc3d108609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7efc3ced3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbeb349897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7efbec2ac119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7efc380c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7efc3d108609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7efc3ced3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f697bbb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f697ce8bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f697ce90a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f697ce91dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f69c892ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f69cd971609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f69cd73c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f697bbb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f697ce8bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f697ce90a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f697ce91dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f69c892ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f69cd971609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f69cd73c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f697bbb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f697cb15119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f69c892ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f69cd971609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f69cd73c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: self.grads_buffer.append(recv_grad()) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank8]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank8]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank8]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank8]: dist.recv( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank8]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank29]: self.grads_buffer.append(recv_grad()) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank29]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank29]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank29]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default5]:[rank29]: dist.recv( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank29]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd7a4123897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd7a53fcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd7a5401a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd7a5402dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fd7f0e9be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fd7f5ee2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fd7f5cad353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd7a4123897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd7a53fcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd7a5401a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd7a5402dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fd7f0e9be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fd7f5ee2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fd7f5cad353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd7a4123897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fd7a5086119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fd7f0e9be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fd7f5ee2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fd7f5cad353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea39d8c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea3b065c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea3b06aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea3b06bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fea86b04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fea8bb4b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fea8b916353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea39d8c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea3b065c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea3b06aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea3b06bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fea86b04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fea8bb4b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fea8b916353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea39d8c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fea3acef119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fea86b04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fea8bb4b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fea8b916353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank0]: self.grads_buffer.append(recv_grad()) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank0]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank0]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank0]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank0]: dist.recv( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank0]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f122f58c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1230865c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f123086aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f123086bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f127c304e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f128134b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1281116353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f122f58c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1230865c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f123086aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f123086bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f127c304e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f128134b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1281116353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f122f58c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f12304ef119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f127c304e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f128134b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f1281116353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, 
**kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank63]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3fcbee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff3fdec7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff3fdecca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff3fdecddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7ff449966e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7ff44e9ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7ff44e778353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3fcbee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff3fdec7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff3fdecca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff3fdecddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7ff449966e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7ff44e9ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7ff44e778353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3fcbee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7ff3fdb51119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7ff449966e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7ff44e9ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7ff44e778353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6e5c6fa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6e5d9d3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6e5d9d8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6e5d9d9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f6ea9472e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f6eae4b9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f6eae284353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6e5c6fa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6e5d9d3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6e5d9d8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6e5d9d9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f6ea9472e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f6eae4b9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f6eae284353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6e5c6fa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f6e5d65d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f6ea9472e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f6eae4b9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f6eae284353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank15]: self.grads_buffer.append(recv_grad()) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank15]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank15]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank15]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank15]: dist.recv( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank15]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3047e22897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f30490fbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3049100a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3049101dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3094b9ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3099be1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f30999ac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3047e22897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f30490fbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3049100a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3049101dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3094b9ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3099be1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f30999ac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3047e22897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3048d85119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f3094b9ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f3099be1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f30999ac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank14]: self.grads_buffer.append(recv_grad()) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank14]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank14]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default6]:[rank14]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank14]: dist.recv( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank14]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 
525, in backward -[default6]:[rank30]: result = loss.backward() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank30]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default7]:[rank31]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default7]:[rank31]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank30]: self.grads_buffer.append(recv_grad()) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank30]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank30]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default6]:[rank30]: dist.recv( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank31]: self.grads_buffer.append(recv_grad()) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank10]: self.grads_buffer.append(recv_grad()) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank10]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank10]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank10]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank10]: dist.recv( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank10]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank2]: self.grads_buffer.append(recv_grad()) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank2]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank2]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank2]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank2]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank2]: dist.recv( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank2]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank13]: self.grads_buffer.append(recv_grad()) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank13]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank13]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank13]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank13]: dist.recv( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank13]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4b32b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb4b4589c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb4b458ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb4b458fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb500028e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb50506f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb504e3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4b32b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb4b4589c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb4b458ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb4b458fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb500028e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb50506f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb504e3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4b32b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fb4b4213119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb500028e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fb50506f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fb504e3a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3e4cfd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb3e5fd6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb3e5fdba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb3e5fdcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fb431a75e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fb436abc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb436887353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3e4cfd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb3e5fd6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb3e5fdba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb3e5fdcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fb431a75e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fb436abc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb436887353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3e4cfd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fb3e5c60119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fb431a75e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fb436abc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fb436887353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea0500c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea062e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea062eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea062ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fea51d84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fea56dcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fea56b96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea0500c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea062e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea062eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea062ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fea51d84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fea56dcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fea56b96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea0500c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fea05f6f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fea51d84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fea56dcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fea56b96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f20c5398897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f20c6671c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f20c6676a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f20c6677dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f2112110e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f2117157609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2116f22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f20c5398897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f20c6671c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f20c6676a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f20c6677dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f2112110e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f2117157609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2116f22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f20c5398897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f20c62fb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f2112110e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f2117157609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f2116f22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3568e15897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f356a0eec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f356a0f3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f356a0f4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f35b5b8de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f35babd4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f35ba99f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3568e15897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f356a0eec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f356a0f3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f356a0f4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f35b5b8de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f35babd4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f35ba99f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3568e15897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f3569d78119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f35b5b8de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f35babd4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f35ba99f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff2d0f9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff2d2278c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff2d227da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff2d227edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff31dd17e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff322d5e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff322b29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff2d0f9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff2d2278c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff2d227da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff2d227edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff31dd17e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff322d5e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff322b29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff2d0f9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7ff2d1f02119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7ff31dd17e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7ff322d5e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7ff322b29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return user_fn(self, *args) -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank21]: self.grads_buffer.append(recv_grad()) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank21]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank21]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank21]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank21]: dist.recv( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0e00ea3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0e0217cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0e02181a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0e02182dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f0e4dc1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:[rank21]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:frame #5: + 0x8609 (0x7f0e52c62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f0e52a2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0e00ea3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0e0217cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0e02181a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0e02182dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f0e4dc1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f0e52c62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f0e52a2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0e00ea3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f0e01e06119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f0e4dc1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f0e52c62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f0e52a2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa7d56d3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa7d69acc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa7d69b1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa7d69b2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fa82244be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fa827492609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fa82725d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa7d56d3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa7d69acc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa7d69b1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa7d69b2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fa82244be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fa827492609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fa82725d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa7d56d3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fa7d6636119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fa82244be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fa827492609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fa82725d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc1d355e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcaec714897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc1d4837c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f631b815897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcaed9edc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f631caeec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcaed9f2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcaed9f3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f631caf3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fcb3948ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f631caf4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc1d483ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #5: + 0x8609 (0x7fcb3e4d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc1d483ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f636858de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #6: clone + 0x43 (0x7fcb3e29e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:frame #5: + 0x8609 (0x7f636d5d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]: -[default7]:frame #6: clone + 0x43 (0x7f636d39f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #4: + 0xd3e95 (0x7fc2202d6e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]:frame #5: + 0x8609 (0x7fc22531d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]: -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default2]:frame #6: clone + 0x43 (0x7fc2250e8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2a41749897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2a42a22c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2a42a27a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2a42a28dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f2a8e4c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2a93508609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2a932d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2a41749897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2a42a22c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2a42a27a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2a42a28dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f2a8e4c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2a93508609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2a932d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2a41749897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f2a426ac119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f2a8e4c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f2a93508609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f2a932d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f845cc2e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f845df07c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f845df0ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f845df0ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f84a99a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f84ae9ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f84ae7b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f845cc2e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f845df07c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f845df0ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f845df0ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f84a99a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f84ae9ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f84ae7b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f845cc2e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f845db91119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f84a99a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f84ae9ed609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7f84ae7b8353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600008 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600006 milliseconds before timing out.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f941457d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9415856c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f941585ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f941585cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f94612f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f946633c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9466107353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f941457d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9415856c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f941585ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f941585cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f94612f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f946633c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9466107353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f941457d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f94154e0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f94612f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f946633c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f9466107353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, 
**kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600079 milliseconds before timing out.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f08b9ed7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f08bb1b0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f08bb1b5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f08bb1b6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f0906c4fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f090bc96609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f090ba61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f08b9ed7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f08bae3a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f0906c4fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f090bc96609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f090ba61353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1a652f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1a665d1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1a665d6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1a665d7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1ab2070e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f1ab70b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1ab6e82353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1a652f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1a665d1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1a665d6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1a665d7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1ab2070e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f1ab70b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1ab6e82353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1a652f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f1a6625b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f1ab2070e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f1ab70b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f1ab6e82353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfebdac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdfed085c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdfed08aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdfed08bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fe038b24e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fe03db6b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fe03d936353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfebdac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdfed085c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdfed08aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdfed08bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fe038b24e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fe03db6b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fe03d936353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfebdac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fdfecd0f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fe038b24e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fe03db6b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fe03d936353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fecefa76897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fecf0d4fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fecf0d54a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fecf0d55dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fed3c7eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fed41835609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fed41600353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fecefa76897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fecf0d4fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fecf0d54a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fecf0d55dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fed3c7eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fed41835609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fed41600353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fecefa76897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fecf09d9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fed3c7eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fed41835609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fed41600353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17cad02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f17cbfdbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f17cbfe0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f17cbfe1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1817a7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f181cac1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f181c88c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17cad02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f17cbfdbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f17cbfe0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f17cbfe1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1817a7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f181cac1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f181c88c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17cad02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f17cbc65119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f1817a7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f181cac1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f181c88c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc508fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc51bd6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc51bdba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc51bdcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fdc9d675e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fdca26bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fdca2487353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc508fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc51bd6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc51bdba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc51bdcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fdc9d675e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fdca26bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fdca2487353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc508fd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fdc51860119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fdc9d675e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fdca26bc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fdca2487353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f15a061c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f15a18f5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f15a18faa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f15a18fbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f15ed394e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f15f23db609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f15f21a6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f15a061c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f15a18f5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f15a18faa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f15a18fbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f15ed394e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f15f23db609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f15f21a6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f15a061c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f15a157f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f15ed394e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f15f23db609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f15f21a6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8cf0d93897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8cf206cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8cf2071a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8cf2072dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8d3db0be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f8d42b52609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f8d4291d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8cf0d93897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8cf206cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8cf2071a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8cf2072dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8d3db0be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f8d42b52609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f8d4291d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8cf0d93897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f8cf1cf6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f8d3db0be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f8d42b52609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f8d4291d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc398727897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc399a00c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc399a05a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc399a06dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fc3e549fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fc3ea4e6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fc3ea2b1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc398727897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc399a00c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc399a05a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc399a06dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fc3e549fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fc3ea4e6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fc3ea2b1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc398727897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fc39968a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fc3e549fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fc3ea4e6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fc3ea2b1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1dace4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1dae124c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1dae129a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1dae12adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f1df9bc3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f1dfec0a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1dfe9d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1dace4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1dae124c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1dae129a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1dae12adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f1df9bc3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f1dfec0a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1dfe9d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1dace4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f1daddae119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f1df9bc3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f1dfec0a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f1dfe9d5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35c5c42897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f35c6f1bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f35c6f20a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f35c6f21dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f36129bae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3617a01609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f36177cc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35c5c42897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f35c6f1bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f35c6f20a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f35c6f21dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f36129bae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3617a01609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f36177cc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35c5c42897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f35c6ba5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f36129bae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f3617a01609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f36177cc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]: -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f386459d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3865876c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f386587ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f386587cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f38b1315e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f38b635c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f38b6127353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f386459d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3865876c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f386587ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f386587cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f38b1315e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f38b635c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f38b6127353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f386459d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3865500119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f38b1315e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f38b635c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f38b6127353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b5b780897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5b5ca59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5b5ca5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5b5ca5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5ba84f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5bad53f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5bad30a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b5b780897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5b5ca59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5b5ca5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5b5ca5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5ba84f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5bad53f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5bad30a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b5b780897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f5b5c6e3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f5ba84f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f5bad53f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f5bad30a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63903bc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6391695c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f639169aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f639169bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f63dd134e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f63e217b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f63e1f46353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63903bc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6391695c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f639169aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f639169bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f63dd134e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f63e217b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f63e1f46353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63903bc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f639131f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f63dd134e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f63e217b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f63e1f46353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7ca588897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7cb861c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7cb866a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7cb867dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff817300e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff81c347609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff81c112353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7ca588897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7cb861c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7cb866a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7cb867dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff817300e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff81c347609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff81c112353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7ca588897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ff7cb4eb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff817300e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7ff81c347609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ff81c112353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bd37f6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9bd4acfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9bd4ad4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9bd4ad5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f9c2056ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f9c255b5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f9c25380353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bd37f6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9bd4acfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9bd4ad4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9bd4ad5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f9c2056ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f9c255b5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f9c25380353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bd37f6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f9bd4759119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f9c2056ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f9c255b5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f9c25380353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdcbe208897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdcbf4e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdcbf4e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdcbf4e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fdd0af80e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fdd0ffc7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fdd0fd92353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdcbe208897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdcbf4e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdcbf4e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdcbf4e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fdd0af80e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fdd0ffc7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fdd0fd92353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdcbe208897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fdcbf16b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fdd0af80e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fdd0ffc7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fdd0fd92353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f22f9f77897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f22fb250c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f22fb255a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f22fb256dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f2346cefe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f234bd36609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f234bb01353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f22f9f77897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f22fb250c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f22fb255a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f22fb256dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f2346cefe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f234bd36609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f234bb01353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f22f9f77897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f22faeda119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f2346cefe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f234bd36609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f234bb01353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f203ae0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f203c0e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f203c0e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f203c0e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f2087b82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f208cbc9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f208c994353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f203ae0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f203c0e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f203c0e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f203c0e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f2087b82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f208cbc9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f208c994353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f203ae0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f203bd6d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f2087b82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f208cbc9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f208c994353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c98dbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4c9a097c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4c9a09ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4c9a09ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4ce5b36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4ceab7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4cea948353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c98dbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4c9a097c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4c9a09ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4c9a09ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4ce5b36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4ceab7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4cea948353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c98dbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4c99d21119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f4ce5b36e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f4ceab7d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4cea948353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f61e4dcd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f61e60a6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f61e60aba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f61e60acdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6231b45e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6236b8c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f6236957353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f61e4dcd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f61e60a6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f61e60aba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f61e60acdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6231b45e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6236b8c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f6236957353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f61e4dcd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f61e5d30119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f6231b45e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f6236b8c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f6236957353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f003fd0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0040fe8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0040feda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0040feedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f008ca87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f0091ace609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f0091899353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f003fd0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0040fe8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0040feda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0040feedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f008ca87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f0091ace609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f0091899353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f003fd0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f0040c72119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f008ca87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f0091ace609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f0091899353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f38ccc70897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f38cdf49c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f38cdf4ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f38cdf4fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f39199e8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f391ea2f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f391e7fa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f38ccc70897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f38cdf49c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f38cdf4ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f38cdf4fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f39199e8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f391ea2f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f391e7fa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f38ccc70897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f38cdbd3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f39199e8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f391ea2f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f391e7fa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84ec0a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84ed379c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84ed37ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84ed37fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8538e18e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f853de5f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f853dc2a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84ec0a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84ed379c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84ed37ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84ed37fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8538e18e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f853de5f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f853dc2a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84ec0a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f84ed003119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f8538e18e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f853de5f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f853dc2a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc09e169897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc09f442c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc09f447a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc09f448dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc0eaee1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc0eff28609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc0efcf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc09e169897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc09f442c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc09f447a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc09f448dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc0eaee1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc0eff28609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc0efcf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc09e169897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fc09f0cc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fc0eaee1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fc0eff28609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fc0efcf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf66fde897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcf682b7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcf682bca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcf682bddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fcfb3d56e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fcfb8d9d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fcfb8b68353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf66fde897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcf682b7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcf682bca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcf682bddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fcfb3d56e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fcfb8d9d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fcfb8b68353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf66fde897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fcf67f41119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fcfb3d56e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fcfb8d9d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fcfb8b68353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f21986b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2199990c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2199995a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2199996dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f21e542fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f21ea476609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f21ea241353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f21986b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2199990c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2199995a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2199996dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f21e542fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f21ea476609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f21ea241353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f21986b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f219961a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f21e542fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f21ea476609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f21ea241353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5fd15a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5fe433c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5fe438a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5fe439dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc649ed2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc64ef19609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc64ece4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5fd15a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5fe433c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5fe438a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5fe439dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc649ed2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc64ef19609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc64ece4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5fd15a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fc5fe0bd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fc649ed2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fc64ef19609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fc64ece4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f12ca5a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f12cb87dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f12cb882a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f12cb883dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f131731ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f131c363609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f131c12e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f12ca5a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f12cb87dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f12cb882a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f12cb883dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f131731ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f131c363609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f131c12e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f12ca5a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f12cb507119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f131731ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f131c363609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f131c12e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcdb4073897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcdb534cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcdb5351a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcdb5352dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fce00debe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fce05e32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fce05bfd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcdb4073897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcdb534cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcdb5351a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcdb5352dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fce00debe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fce05e32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fce05bfd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcdb4073897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fcdb4fd6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fce00debe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fce05e32609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fce05bfd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9a145d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd9a2736c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd9a273ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd9a273cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd9ee1d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9f321c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd9f2fe7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=262144, NumelOut=262144, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9a145d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd9a2736c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd9a273ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd9a273cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd9ee1d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9f321c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd9f2fe7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9a145d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fd9a23c0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd9ee1d5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fd9f321c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd9f2fe7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -W0703 07:14:31.195000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1177002 closing signal SIGTERM -W0703 07:14:31.195000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1177003 closing signal SIGTERM -W0703 07:14:31.195000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1177004 closing signal SIGTERM -W0703 07:14:31.195000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1177005 closing signal SIGTERM -W0703 07:14:31.195000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1177006 closing signal SIGTERM -W0703 07:14:31.195000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1177007 closing signal SIGTERM -W0703 07:14:31.195000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1177009 closing signal SIGTERM -W0703 07:14:31.205000 139651878709056 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 941953 closing signal SIGTERM -W0703 07:14:31.205000 139651878709056 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 941955 closing signal SIGTERM -W0703 07:14:31.205000 139651878709056 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 941956 closing signal SIGTERM -W0703 07:14:31.206000 139651878709056 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 941957 closing signal SIGTERM -W0703 07:14:31.206000 139651878709056 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 941958 closing signal SIGTERM -W0703 07:14:31.206000 139651878709056 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 941960 closing signal SIGTERM -E0703 07:14:32.547000 139651878709056 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 941954) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 
2024-07-03_07:14:31 - host : ip-26-0-172-73.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 941959) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 941959 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:31 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 941954) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 941954 -============================================================ -E0703 07:14:33.010000 139791967815488 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 6 (pid: 1177008) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:31 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1177008) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1177008 -============================================================ -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -W0703 07:14:35.216000 139993763931904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1872628_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:35.409000 139786653112064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-178.ec2.internal_562922_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:35.514000 140539210348288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1100021_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 07:14:35.516000 140568871868160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_810759_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:35.804000 140226881578752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3256690_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:36.013000 140362999580416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1899559_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:36.200000 140544871081792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1100094 closing signal SIGTERM -W0703 07:14:36.200000 140544871081792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1100095 closing signal SIGTERM -W0703 07:14:36.200000 140544871081792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1100097 closing signal SIGTERM -W0703 07:14:36.200000 140544871081792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1100099 closing signal SIGTERM -W0703 07:14:36.200000 140544871081792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1100100 closing signal SIGTERM -W0703 07:14:36.200000 140544871081792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1100101 closing signal SIGTERM -W0703 07:14:36.212000 140368660313920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1899634 closing signal SIGTERM -W0703 07:14:36.212000 140368660313920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1899638 closing signal SIGTERM -W0703 07:14:36.212000 140368660313920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1899640 closing signal SIGTERM -W0703 07:14:36.214000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 562995 closing signal SIGTERM -W0703 07:14:36.215000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 562996 closing signal SIGTERM -W0703 07:14:36.215000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 562997 closing signal SIGTERM -W0703 07:14:36.215000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 562998 closing signal SIGTERM -W0703 07:14:36.215000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 563000 closing signal SIGTERM -W0703 07:14:36.215000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 563001 closing signal SIGTERM -W0703 07:14:36.215000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 563002 closing signal SIGTERM -W0703 07:14:36.218000 140232542312256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3256768 closing signal SIGTERM -W0703 07:14:36.222000 140574532601664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 810832 closing signal SIGTERM -W0703 07:14:36.222000 140574532601664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 810833 closing signal SIGTERM -W0703 07:14:36.222000 140574532601664 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 810834 closing signal SIGTERM -W0703 07:14:36.222000 140574532601664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 810835 closing signal SIGTERM -W0703 07:14:36.222000 140574532601664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 810836 closing signal SIGTERM -W0703 07:14:36.222000 140574532601664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 810837 closing signal SIGTERM -W0703 07:14:36.222000 140574532601664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 810839 closing signal SIGTERM -W0703 07:14:36.246000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1872703 closing signal SIGTERM -W0703 07:14:36.247000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1872704 closing signal SIGTERM -W0703 07:14:36.247000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1872705 closing signal SIGTERM -W0703 07:14:36.247000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1872706 closing signal SIGTERM -W0703 07:14:36.247000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1872707 closing signal SIGTERM -W0703 07:14:36.247000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1872708 closing signal SIGTERM -W0703 07:14:36.247000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1872710 closing signal SIGTERM -E0703 07:14:36.566000 140232542312256 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3256763) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 07:14:36.578000 140232542312256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3256690_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:36.608000 140232542312256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3256690_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:36.634000 140232542312256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3256690_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-226.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 3256764) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3256764 -[2]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-226.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 3256765) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3256765 -[3]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-226.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 3256766) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3256766 -[4]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-226.ec2.internal - rank : 28 (local_rank: 4) - exitcode : -6 (pid: 3256767) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3256767 -[5]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-226.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 3256769) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3256769 -[6]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-226.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 3256770) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3256770 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-226.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 3256763) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3256763 -============================================================ -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -E0703 07:14:37.352000 140368660313920 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1899633) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 07:14:37.367000 140368660313920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1899559_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 07:14:37.399000 140368660313920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1899559_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:37.418000 140368660313920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1899559_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_07:14:36 - host : ip-26-0-168-238.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 1899635) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1899635 -[2]: - time : 2024-07-03_07:14:36 - host : ip-26-0-168-238.ec2.internal - rank : 35 (local_rank: 3) - exitcode : -6 (pid: 1899636) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1899636 -[3]: - time : 2024-07-03_07:14:36 - host : ip-26-0-168-238.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 1899637) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1899637 -[4]: - time : 2024-07-03_07:14:36 - host : ip-26-0-168-238.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 1899639) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1899639 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:36 - host : ip-26-0-168-238.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 1899633) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1899633 -============================================================ -E0703 07:14:37.921000 140544871081792 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 1100096) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 07:14:37.934000 140544871081792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1100021_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 07:14:37.966000 140544871081792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1100021_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:37.977000 140544871081792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1100021_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_07:14:36 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 1100098) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1100098 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:36 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : -6 (pid: 1100096) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1100096 -============================================================ -E0703 07:14:38.018000 139792313845568 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 4 (pid: 562999) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -W0703 07:14:38.031000 139792313845568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_562922_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 07:14:38.037000 140574532601664 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 6 (pid: 810838) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 07:14:38.049000 140574532601664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_810759_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 07:14:38.065000 139792313845568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_562922_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:38.073000 139792313845568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_562922_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:36 - host : ip-26-0-161-178.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 562999) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 562999 -============================================================ -W0703 07:14:38.078000 140574532601664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_810759_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:38.087000 140574532601664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_810759_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:36 - host : ip-26-0-163-220.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 810838) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 810838 -============================================================ -E0703 07:14:38.342000 139999424665408 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 6 (pid: 1872709) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 07:14:38.354000 139999424665408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1872628_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:38.389000 139999424665408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1872628_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:14:38.398000 139999424665408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1872628_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
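Each agent in this log reports the same failure signature: one local rank exits with SIGABRT (exitcode -6), the agent sends SIGTERM to its remaining ranks, and the keep-alive/shutdown calls then fail with RendezvousConnectionError, apparently because the c10d store backing the rendezvous is no longer reachable; the remaining agents below show the same pattern. A minimal way to pull the per-node root-cause records out of a torchrun log like this one (a hypothetical helper, not part of the original scripts):

# one block per failing agent; the 7 lines after the header carry time, host, rank, exitcode and signal
grep -A 7 "Root Cause (first observed failure)" log.out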
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_07:14:36 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 1872709) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1872709 -============================================================ -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/bench.slurm deleted file mode 100644 index 6be41cb467398a047b93dffefeff3d741b6beace..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. 
It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
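The update_status helper above can only ever record "running": it is started from inside the batch script, so by the time it first polls squeue the allocation has already left PENDING, which is the likely reason behind the "doesn't update status for pending" comment. A variant that would also capture the pending state if run from outside the allocation (a sketch, not part of the original script):

while true; do
    job_status=$(squeue --job "$job_id" --noheader --format=%T)
    case "$job_status" in
        PENDING)  printf "pending" > "$status_file" ;;
        RUNNING)  printf "running" > "$status_file"; break ;;
        "")       break ;;   # job finished or no longer visible to squeue
    esac
    sleep 10
done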
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/config.yaml deleted file mode 100644 index 202fc85a93521766b58a659ebb8c2b1af187d47a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1024 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out deleted file mode 100644 index 8431b047d0558829ba4e6b7da9f817ca47ac2913..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/log.out +++ /dev/null @@ -1,6355 +0,0 @@ -======================== -START TIME: Tue Jul 2 21:44:57 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
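The config above pins all 64 GPUs to model parallelism: dp=1, tp=32, pp=2, so each optimizer step processes a single micro-batch of 1024 sequences of length 4096. A quick sanity check of those numbers (a sketch, not taken from the repo):

dp=1; tp=32; pp=2
echo $(( dp * tp * pp ))          # 64, i.e. the 8 nodes x 8 GPUs requested via SBATCH
echo $(( dp * 1024 * 1 * 4096 ))  # 4194304 tokens per step (micro_batch_size x batch_accumulation x sequence_length)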
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0702 21:45:04.281000 140397136058176 torch/distributed/run.py:757] -W0702 21:45:04.281000 140397136058176 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.281000 140397136058176 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 21:45:04.281000 140397136058176 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.284000 139776129410880 torch/distributed/run.py:757] -W0702 21:45:04.284000 139776129410880 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.284000 139776129410880 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 21:45:04.284000 139776129410880 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.386000 140046963582784 torch/distributed/run.py:757] -W0702 21:45:04.386000 140046963582784 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.386000 140046963582784 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 21:45:04.386000 140046963582784 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.422000 140469081098048 torch/distributed/run.py:757] -W0702 21:45:04.422000 140469081098048 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.422000 140469081098048 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 21:45:04.422000 140469081098048 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.607000 139990385469248 torch/distributed/run.py:757] -W0702 21:45:04.607000 139990385469248 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.607000 139990385469248 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0702 21:45:04.607000 139990385469248 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.634000 140106095568704 torch/distributed/run.py:757] -W0702 21:45:04.634000 140106095568704 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.634000 140106095568704 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 21:45:04.634000 140106095568704 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.851000 139678711797568 torch/distributed/run.py:757] -W0702 21:45:04.851000 139678711797568 torch/distributed/run.py:757] ***************************************** -W0702 21:45:04.851000 139678711797568 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 21:45:04.851000 139678711797568 torch/distributed/run.py:757] ***************************************** -W0702 21:45:05.792000 139896234325824 torch/distributed/run.py:757] -W0702 21:45:05.792000 139896234325824 torch/distributed/run.py:757] ***************************************** -W0702 21:45:05.792000 139896234325824 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 21:45:05.792000 139896234325824 torch/distributed/run.py:757] ***************************************** -[default0]:07/02/2024 21:45:31 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=2, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=32, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 21:45:31 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=1024, -[default0]:07/02/2024 21:45:31 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024')), -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/02/2024 21:45:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: No checkpoint path provided. 
-[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=3|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=1|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=2|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. 
-[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: No checkpoint path provided. 
-[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=5|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=4|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=4|ip-26-0-162-233]: No checkpoint path provided. 
-[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=7|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=6|ip-26-0-162-233]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/02/2024 21:45:50 [INFO|DP=0|PP=1|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/02/2024 21:45:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/02/2024 21:45:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/02/2024 21:45:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/02/2024 21:45:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/02/2024 21:45:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/02/2024 21:45:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/02/2024 21:45:53 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/02/2024 21:45:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/02/2024 21:45:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/02/2024 21:45:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/02/2024 21:45:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-02 21:45:55.822622 | mbs: 1024 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/02/2024 21:45:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/02/2024 21:45:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=28|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 21:45:55 [WARNING|DP=0|PP=1|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=26|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:55 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=25|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=27|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=29|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=31|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=23|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=21|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=16|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 21:45:55 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=17|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=20|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=27|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=30|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=26|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=16|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. 
Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=24|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=30|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=19|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=29|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=28|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=24|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=31|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=25|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=18|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=22|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=22|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=20|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=17|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=21|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=23|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=18|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 21:45:56 [WARNING|DP=0|PP=1|TP=19|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 21:45:56 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: output = model(**micro_batch) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: sharded_logits = self.model( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank21]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank21]: output = self.o_proj(attention_output) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank21]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.06 GiB is free. Including non-PyTorch memory, this process has 74.26 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: output = self.o_proj(attention_output) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: return row_linear( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. 
GPU -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: output = model(**micro_batch) -[default4]:[rank20]: output = model(**micro_batch) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: output = 
model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: sharded_logits = self.model( -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: sharded_logits = self.model( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: sharded_logits = self.model( -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: output = self.o_proj(attention_output) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: output = self.o_proj(attention_output) -[default4]:[rank20]: return row_linear( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: out = F.linear(input, weight, bias) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.17 GiB is free. Including non-PyTorch memory, this process has 74.15 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.06 GiB is free. Including non-PyTorch memory, this process has 74.26 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default3]:[rank19]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank19]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank19]:     return forward_call(*args, **kwargs)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default3]:[rank19]:     output = self.o_proj(attention_output)
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank19]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank19]:     return forward_call(*args, **kwargs)
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default3]:[rank19]:     return row_linear(
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default3]:[rank19]:     out = F.linear(input, weight, bias)
-[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU has a total capacity of 79.33 GiB of which 5.06 GiB is free. Including non-PyTorch memory, this process has 74.26 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... identical CUDA OutOfMemoryError tracebacks from ranks 0, 7, 13, 24, 25, 26, 27, 28, 29, 30 and 31 elided; in the original log they are interleaved across ranks. Each fails at the same out = F.linear(input, weight, bias) call in row_linear (reached from o_proj in llama.py line 598) while trying to allocate 16.00 GiB on a 79.33 GiB GPU. Reported free memory: 5.42 GiB (ranks 25, 27, 29, 31), 5.24 GiB (ranks 26, 28, 30), 5.94 GiB (rank 7), 5.17 GiB (rank 13); the figures for ranks 0 and 24 are truncated in the log. In every case 62.85 GiB is allocated by PyTorch and only 1.50 GiB is reserved but unallocated. ...]
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank27]: output = self.o_proj(attention_output) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: return row_linear( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.42 GiB is free. Including non-PyTorch memory, this process has 73.90 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank12]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank12]: output = model(**micro_batch) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: sharded_logits = self.model( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", 
line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank12]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank12]: output = self.pp_block(**new_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank12]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank12]: output = self.o_proj(attention_output) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank12]: return row_linear( -[default4]:[rank12]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank12]: out = F.linear(input, weight, bias) -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.06 GiB is free. Including non-PyTorch memory, this process has 74.26 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank9]: output = model(**micro_batch) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank9]: sharded_logits = self.model( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank9]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank9]: output = self.pp_block(**new_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank9]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank9]: output = self.o_proj(attention_output) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank9]: return row_linear( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank9]: out = F.linear(input, weight, bias) -[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.17 GiB is free. Including non-PyTorch memory, this process has 74.15 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default0]:[rank8]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank8]: output = self.o_proj(attention_output) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank8]: out = F.linear(input, weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. 
GPU -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: 
output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank11]: output = model(**micro_batch) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, 
**kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.17 GiB is free. Including non-PyTorch memory, this process has 74.15 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
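Every rank's stack bottoms out in nanotron's row_linear (tensor_parallel/functional.py, line 474) calling F.linear on the o_proj weight shard. For context only, a generic Megatron-style sketch of what a row-parallel linear computes under tensor parallelism; this is an assumed illustration, not nanotron's actual implementation, and tp_group is a hypothetical process group:

import torch
import torch.nn.functional as F
import torch.distributed as dist

def row_parallel_linear(x_shard, weight_shard, bias=None, tp_group=None):
    # x_shard:      (..., in_features // tp_size), input already split along the feature dim
    # weight_shard: (out_features, in_features // tp_size)
    partial = F.linear(x_shard, weight_shard)                       # the allocation that fails above
    dist.all_reduce(partial, op=dist.ReduceOp.SUM, group=tp_group)  # sum partial products across TP ranks
    if bias is not None:
        partial = partial + bias                                    # bias added once, after the reduction
    return partial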
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: output = self.o_proj(attention_output) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.17 GiB is free. Including non-PyTorch memory, this process has 74.15 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: Traceback (most recent call last): -[default6]:[rank6]: Traceback (most recent call last): -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank1]: output = model(**micro_batch) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: trainer.train(dataloader) -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank1]: sharded_logits = self.model( -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: 
output = model(**micro_batch) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank2]: sharded_logits = self.model( -[default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default6]:[rank6]: output = model(**micro_batch) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: output = self.pp_block(**new_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: sharded_logits = self.model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: output = self.pp_block(**new_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank1]: output = self.o_proj(attention_output) -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default1]:[rank1]: return row_linear( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank1]: out = F.linear(input, weight, bias) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank6]: output = self.pp_block(**new_kwargs) -[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.94 GiB is free. Including non-PyTorch memory, this process has 73.38 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank2]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank2]: return row_linear( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank2]: out = F.linear(input, weight, bias) -[default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.05 GiB is free. Including non-PyTorch memory, this process has 73.27 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: output = model(**micro_batch) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default5]:[rank5]: output = model(**micro_batch) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: sharded_logits = self.model( -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in 
forward -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: sharded_logits = self.model( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank3]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: output = self.o_proj(attention_output) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank3]: return row_linear( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank3]: out = F.linear(input, weight, bias) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.94 GiB is free. Including non-PyTorch memory, this process has 73.38 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank5]: output = self.o_proj(attention_output) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: return row_linear( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: out = F.linear(input, weight, bias) -[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.94 GiB is free. Including non-PyTorch memory, this process has 73.38 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank6]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank6]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: trainer.train(dataloader) -[default6]:[rank14]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank23]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank23]: output = self.o_proj(attention_output) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: return row_linear( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: out = F.linear(input, weight, bias) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.05 GiB is free. Including non-PyTorch memory, this process has 73.27 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default2]:[rank10]: output = model(**micro_batch) -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.06 GiB is free. Including non-PyTorch memory, this process has 74.26 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default6]:[rank14]: output = model(**micro_batch) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: sharded_logits = self.model( -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank4]: output = model(**micro_batch) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank4]: sharded_logits = self.model( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank4]: output = self.pp_block(**new_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank4]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank4]: output = self.o_proj(attention_output) -[default4]:[rank4]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank4]: return row_linear( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank4]: out = F.linear(input, weight, bias) -[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.05 GiB is free. Including non-PyTorch memory, this process has 73.27 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: sharded_logits = self.model( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank22]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank22]: output = model(**micro_batch) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank22]: sharded_logits = self.model( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank22]: output = self.pp_block(**new_kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-p[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -ackages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank22]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank22]: output = self.o_proj(attention_output) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank22]: return row_linear( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: out = F.linear(input, weight, bias) -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: 
File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.17 GiB is free. Including non-PyTorch memory, this process has 74.15 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank10]: output = self.o_proj(attention_output) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank14]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank10]: return row_linear( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: output = self.o_proj(attention_output) -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.06 GiB is free. Including non-PyTorch memory, this process has 74.26 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank14]: return row_linear( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: out = F.linear(input, weight, bias) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.06 GiB is free. Including non-PyTorch memory, this process has 74.26 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default2]:[rank18]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: output = self.o_proj(attention_output) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return 
row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.17 GiB is free. Including non-PyTorch memory, this process has 74.15 GiB memory in use. Of the allocated memory 62.85 GiB is allocated by PyTorch, and 1.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default1]:[rank41]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", 
line 1541, in _call_impl
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default0]:[rank40]: pipeline_state.run_communication()
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default0]:[rank40]: recv_activation_tensor = recv_activation()
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default0]:[rank40]: dist.recv(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank40]: return func(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank40]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default0]:[rank40]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default0]:[rank40]: [native backtrace frames #0-#63: c10::Error, c10d::TCPStore::doWait/doGet/get, c10d::PrefixStore::get, c10d::ProcessGroupNCCL::broadcastUniqueNCCLID/getNCCLComm/recv, then CPython call frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call)]
-[default0]:[rank40]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default1]:[rank41]: [same traceback as rank 40: block.py:126 forward -> functional.py:117 recv_from_pipeline_state_buffer -> state.py:150 run_communication -> state.py:31 __call__ -> p2p.py:353 recv_tensors -> p2p.py:326 irecv_tensors -> p2p.py:246 _recv_meta -> dist.recv -> pg.recv([tensor], group_src_rank, tag).wait()]
-[default1]:[rank41]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank41]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank41]: [native backtrace frames #0-#63: same symbols as rank 40]
-[default1]:[rank41]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default2]:[rank42]: Traceback (most recent call last):
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank42]: trainer.train(dataloader)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank42]: output = model(**micro_batch)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank42]: sharded_logits = self.model(
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default2]:[rank42]: pipeline_state.run_communication()
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default2]:[rank42]: recv_activation_tensor = recv_activation()
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default2]:[rank42]: dist.recv(
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank42]: return func(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank42]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank42]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default2]:[rank42]: [native backtrace frames #0-#63: same symbols as rank 40]
-[default2]:[rank42]: . This may indicate a possible application crash on rank 0 or a network set up issue.
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: frame #60: PyObject_Call + 0xbc (0x56035b613f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56035b5fa2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #62: + 0x150582 (0x56035b613582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #63: PyObject_Call + 0xbc (0x56035b613f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: trainer.train(dataloader) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train 
-[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: sharded_logits = self.model( -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/py[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -thon3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=in[default7]:[rank39]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -put_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states 
-[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( 
-[default6]:[rank46]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank46]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f93fd8a2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: frame #1: + 0x5b3a23e (0x7f94373bf23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: pipeline_state.run_communication() -[default6]:[rank46]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f94373b9c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f94373b9f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f94373bafd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f943736f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f943736f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f943736f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f943736f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f93feb7c189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #10: 
c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f93feb83610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f93feba2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #12: + 0x5adc309 (0x7f9437361309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #13: + 0x5ae6f10 (0x7f943736bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: frame #14: + 0x5ae6fa5 (0x7f943736bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #15: + 0x5124446 (0x7f94369a9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: frame #16: + 0x1acf4b8 (0x7f94333544b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: frame #17: + 0x5aee004 (0x7f9437373004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: frame #18: + 0x5af36b5 (0x7f94373786b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #19: + 0xd2631e (0x7f9449f6231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: frame #20: + 0x47def4 (0x7f94496b9ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: frame #21: + 0x1445a6 (0x55ce31ea65a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors 
-[default6]:[rank46]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55ce31e9fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #23: + 0x150866 (0x55ce31eb2866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55ce31e9b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: dist.recv( -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55ce31ea6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", 
line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: pipeline_state.run_communication() -[default6]:[rank46]: frame #26: PyObject_Call + 0xbc (0x55ce31eb2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: dist.recv( -[default6]:[rank46]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55ce31e992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: output = model(**micro_batch) -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55ce31ea6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55ce31e978fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #30: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: return func(*args, **kwargs) -[default1]:[rank33]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55ce31e978fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank46]: frame 
#32: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55ce31e978fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: sharded_logits = self.model( -[default1]:[rank33]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fecf1b81897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: frame #34: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3eb3ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: frame #1: + 0x5b3a23e (0x7fed2b69e23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55ce31e978fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55ce31e9ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55ce31eb0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #1: + 0x5b3a23e (0x7ff424ec923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fed2b698c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank39]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7ff424ec3c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fed2b698f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7ff424ec3f82 
in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default6]:[rank46]: frame #38: + 0x211239 (0x55ce31f73239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fed2b699fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55ce31e9fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fed2b64e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank46]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55ce31e9b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fed2b64e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7ff424ec4fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: dist.recv( -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fed2b64e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55ce31ea6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff424e79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper 
-[default1]:[rank33]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fed2b64e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55ce31e96c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: return func(*args, **kwargs) -[default6]:[rank46]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55ce31ea6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fecf2e5b189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff424e79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55ce31e978fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: frame #45: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #46: PyObject_Call + 0xbc (0x55ce31eb2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff424e79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fecf2e62610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55ce31e992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #48: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff424e79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #49: PyObject_Call + 0xbc (0x55ce31eb2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fecf2e81978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #12: + 0x5adc309 (0x7fed2b640309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7ff3ec686189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: frame #13: + 0x5ae6f10 (0x7fed2b64af10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank34]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank46]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55ce31e992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #14: + 0x5ae6fa5 (0x7fed2b64afa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55ce31ea6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: dist.recv( -[default7]:[rank39]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7ff3ec68d610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe2221f4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7ff3ec6ac978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55ce31e9f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #1: + 0x5b3a23e (0x7fe25bd1123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: return func(*args, **kwargs) -[default1]:[rank33]: frame #15: + 0x5124446 (0x7fed2ac88446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #12: + 0x5adc309 (0x7ff424e6b309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fe25bd0bc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #13: + 0x5ae6f10 (0x7ff424e75f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fe25bd0bf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55ce31eb0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fe25bd0cfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe25bcc1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #54: + 0x211239 (0x55ce31f73239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe25bcc1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #55: PyObject_Call + 0x207 (0x55ce31eb3067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55ce31e992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #16: + 0x1acf4b8 (0x7fed276334b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #14: + 0x5ae6fa5 (0x7ff424e75fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #57: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe25bcc1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe25bcc1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #17: + 0x5aee004 (0x7fed2b652004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55ce31e978fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #18: + 0x5af36b5 (0x7fed2b6576b5 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #59: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #60: PyObject_Call + 0xbc (0x55ce31eb2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #15: + 0x5124446 (0x7ff4244b3446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55ce31e992b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fe2234ce189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #62: + 0x150582 (0x55ce31eb2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #63: PyObject_Call + 0xbc (0x55ce31eb2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #16: + 0x1acf4b8 (0x7ff420e5e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fe2234d5610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #19: + 0xd2631e (0x7fed3e24131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fe2234f4978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f7b3ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: frame #1: + 0x5b3a23e (0x7f8fb4f0923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f8fb4f03c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #20: + 0x47def4 (0x7fed3d998ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f8fb4f03f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f8fb4f04fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8fb4eb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #17: + 0x5aee004 (0x7ff424e7d004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #18: + 0x5af36b5 (0x7ff424e826b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8fb4eb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8fb4eb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8fb4eb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #21: + 0x1445a6 (0x55a937f1e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f8f7c6c6189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f8f7c6cd610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f8f7c6ec978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #22: _PyObject_MakeTpCall + 0x26b 
(0x55a937f17a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #12: + 0x5adc309 (0x7f8fb4eab309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #13: + 0x5ae6f10 (0x7f8fb4eb5f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #12: + 0x5adc309 (0x7fe25bcb3309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #14: + 0x5ae6fa5 (0x7f8fb4eb5fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #19: + 0xd2631e (0x7ff437a6c31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank33]: frame #23: + 0x150866 (0x55a937f2a866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #15: + 0x5124446 (0x7f8fb44f3446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #16: + 0x1acf4b8 (0x7f8fb0e9e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #17: + 0x5aee004 (0x7f8fb4ebd004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #13: + 0x5ae6f10 (0x7fe25bcbdf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #18: + 0x5af36b5 (0x7f8fb4ec26b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #19: + 0xd2631e (0x7f8fc7aac31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #20: + 0x47def4 (0x7f8fc7203ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #21: + 0x1445a6 (0x5644df0c85a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #20: + 0x47def4 (0x7ff4371c3ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5644df0c1a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #23: + 0x150866 (0x5644df0d4866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5644df0bd142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5644df0c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #21: + 0x1445a6 (0x5567c04d55a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5567c04cea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #26: PyObject_Call + 0xbc (0x5644df0d4f1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5644df0bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5644df0c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5644df0b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #23: + 0x150866 (0x5567c04e1866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #30: + 0x150582 (0x5644df0d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5644df0b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #32: + 0x150582 (0x5644df0d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55a937f13142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5644df0b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #34: + 0x150582 (0x5644df0d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5644df0b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5644df0c0f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55a937f1ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5644df0d2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #38: + 0x211239 (0x5644df195239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5644df0c1a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #14: + 0x5ae6fa5 (0x7fe25bcbdfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5644df0bd3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5644df0c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5644df0b8c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5644df0c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5567c04ca142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5644df0b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #45: + 0x150582 (0x5644df0d4582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #46: PyObject_Call + 0xbc (0x5644df0d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5644df0bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5567c04d5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #26: PyObject_Call + 0xbc (0x55a937f2af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #15: + 0x5124446 (0x7fe25b2fb446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #48: + 0x150582 (0x5644df0d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #49: PyObject_Call + 0xbc (0x5644df0d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5644df0bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #26: PyObject_Call + 0xbc (0x5567c04e1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5644df0c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5644df0c1007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5644df0d2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #54: + 0x211239 (0x5644df195239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #16: + 0x1acf4b8 (0x7fe257ca64b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5567c04c82b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55a937f112b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #17: + 0x5aee004 (0x7fe25bcc5004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5567c04d5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #18: + 0x5af36b5 (0x7fe25bcca6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #19: + 0xd2631e (0x7fe26e8b431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5567c04c68fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55a937f1ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #55: PyObject_Call + 0x207 (0x5644df0d5067 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5644df0bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #57: + 0x150582 (0x5644df0d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #20: + 0x47def4 (0x7fe26e00bef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #30: + 0x150582 (0x5567c04e1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5644df0b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #59: + 0x150582 (0x5644df0d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #60: PyObject_Call + 0xbc (0x5644df0d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5644df0bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #62: + 0x150582 (0x5644df0d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #63: PyObject_Call + 0xbc (0x5644df0d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank33]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55a937f0f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5567c04c68fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #21: + 0x1445a6 (0x55ab037185a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55ab03711a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #30: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #32: + 0x150582 (0x5567c04e1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #23: + 0x150866 (0x55ab03724866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55a937f0f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55ab0370d142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55ab03718a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5567c04c68fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #34: + 0x150582 (0x5567c04e1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #32: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #26: PyObject_Call + 0xbc (0x55ab03724f1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55a937f0f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55ab0370b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55ab03718a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5567c04c68fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5567c04cdf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5567c04dfc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55ab037098fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #34: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #38: + 0x211239 (0x5567c05a2239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #30: + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5567c04cea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default1]:[rank33]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55a937f0f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5567c04ca3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default2]:[rank34]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55ab037098fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55a937f16f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5567c04d5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site[default2]:[rank34]: frame #32: + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5567c04c5c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) --packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank43]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank43]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f12eb74d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank43]: frame #1: + 0x5b3a23e (0x7f132526a23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -> >) + 0x2c7 (0x7f1325264c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f1325264f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f1325265fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f132521a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f132521a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #7: c10d::PrefixStore::get(st[default1]:[rank33]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55a937f28c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55ab037098fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -d::string const&) + 0x31 (0x7f132521a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f132521a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #9: 
c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f12eca27189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f12eca2e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55ab03710f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f12eca4d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #12: + 0x5adc309 (0x7f132520c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #13: + 0x5ae6f10 (0x7f1325216f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #38: + 0x211239 (0x55a937feb239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #14: + 0x5ae6fa5 (0x7f1325216fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #15: + 0x5124446 (0x7f1324854446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #16: + 0x1acf4b8 (0x7f13211ff4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #17: + 0x5aee004 (0x7f132521e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #18: + 0x5af36b5 (0x7f13252236b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55ab03722c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #19: + 0xd2631e (0x7f1337e0d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #20: + 0x47def4 (0x7f1337564ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #21: + 0x1445a6 (0x55c4f4c245a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55a937f17a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55c4f4c1da6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #23: + 0x150866 (0x55c4f4c30866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55c4f4c19142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55c4f4c24a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #38: + 0x211239 (0x55ab037e5239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #26: PyObject_Call + 0xbc (0x55c4f4c30f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55c4f4c172b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55c4f4c24a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55c4f4c158fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #30: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55c4f4c158fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #32: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5567c04d5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55c4f4c158fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #34: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55c4f4c158fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55c4f4c1cf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55c4f4c2ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #38: + 0x211239 (0x55c4f4cf1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55c4f4c1da6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55a937f133e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55c4f4c193e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55c4f4c24a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55c4f4c14c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55c4f4c24a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55ab03711a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5567c04c68fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55c4f4c158fa in
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #45: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #46: PyObject_Call + 0xbc (0x55c4f4c30f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55c4f4c172b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #48: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #49: PyObject_Call + 0xbc (0x55c4f4c30f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55c4f4c172b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin[default1]:[rank33]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55a937f1ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55ab0370d3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -/python3.10) -[default7]:[rank39]: frame #45: + 0x150582 (0x5567c04e1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #46: PyObject_Call + 0xbc (0x5567c04e1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55c4f4c24a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55c4f4c1d007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55c4f4c2ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #54: + 0x211239 (0x55c4f4cf1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #55: PyObject_Call + 0x207 (0x55c4f4c31067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55ab03718a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55ab03708c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55c4f4c172b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #57: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55c4f4c158fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #59: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5567c04c82b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #60: PyObject_Call + 0xbc (0x55c4f4c30f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55c4f4c172b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default3]:[rank43]: frame #62: + 0x150582 (0x55c4f4c30582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55ab03718a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55ab037098fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #63: PyObject_Call + 0xbc (0x55c4f4c30f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank39]: frame #48: + 0x150582 (0x5567c04e1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #45: + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55a937f0ec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #46: PyObject_Call + 0xbc (0x55ab03724f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55a937f1ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55a937f0f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55ab0370b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #49: PyObject_Call + 0xbc (0x5567c04e1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5567c04c82b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #48: + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #45: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5567c04d5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5567c04ce007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #46: PyObject_Call + 0xbc (0x55a937f2af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #49: PyObject_Call + 0xbc (0x55ab03724f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55a937f112b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, 
in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55ab0370b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5567c04dfc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55ab03718a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: frame #54: + 0x211239 (0x5567c05a2239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #55: PyObject_Call + 0x207 (0x5567c04e2067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: frame #48: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55ab03711007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: frame #49: PyObject_Call + 0xbc (0x55a937f2af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5567c04c82b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #57: + 0x150582 (0x5567c04e1582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default2]:[rank34]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55ab03722c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: frame #54: + 0x211239 (0x55ab037e5239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5567c04c68fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55a937f112b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank34]: frame #55: PyObject_Call + 0x207 (0x55ab03725067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55ab0370b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default7]:[rank39]: frame #59: + 0x150582 (0x5567c04e1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #60: PyObject_Call + 0xbc (0x5567c04e1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank33]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55a937f1ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank45]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f50c3294897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank45]: frame #1: + 0x5b3a23e (0x7f50fcdb123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f50fcdabc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #57: + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5567c04c82b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f50fcdabf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f50fcdacfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50fcd61371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55a937f17007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #62: + 0x150582 (0x5567c04e1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50fcd61371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50fcd61371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50fcd61371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55ab037098fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f50c456e189 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f50c4575610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f50c4594978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #63: PyObject_Call + 0xbc (0x5567c04e1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #12: + 0x5adc309 (0x7f50fcd53309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #13: + 0x5ae6f10 (0x7f50fcd5df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55a937f28c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #14: + 0x5ae6fa5 (0x7f50fcd5dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #15: + 0x5124446 (0x7f50fc39b446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #16: + 0x1acf4b8 (0x7f50f8d464b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #17: + 0x5aee004 (0x7f50fcd65004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #18: + 0x5af36b5 (0x7f50fcd6a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #19: + 0xd2631e (0x7f510f954[default7]:[rank39]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default2]:[rank34]: frame #59: + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #20: + 0x47def4 (0x7f510f0abef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #21: + 0x1445a6 (0x5639dd83e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5639dd837a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #23: + 0x150866 (0x5639dd84a866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #54: + 0x211239 (0x55a937feb239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5639dd833142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5639dd83ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #26: PyObject_Call + 0xbc (0x5639dd84af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5639dd8312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #60: PyObject_Call + 0xbc (0x55ab03724f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5639dd83ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5639dd82f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #30: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5639dd82f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #55: PyObject_Call + 0x207 (0x55a937f2b067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #32: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5639dd82f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #34: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55ab0370b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #62: + 0x150582 (0x55ab03724582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5639dd82f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5639dd836f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5639dd848c39 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #38: + 0x211239 (0x5639dd90b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #63: PyObject_Call + 0xbc (0x55ab03724f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5639dd837a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5639dd8333e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5639dd83ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5639dd82ec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55a937f112b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5639dd83ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5639dd82f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #45: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #46: PyObject_Call + 0xbc (0x5639dd84af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5639dd8312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #48: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #49: PyObject_Call + 0xbc (0x5639dd84af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/pyt[default1]:[rank33]: frame #57: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-hon3.10) -[default5]:[rank45]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5639dd8312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5639dd83ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55a937f0f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5639dd837007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5639dd848c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #54: + 0x211239 (0x5639dd90b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #55: PyObject_Call + 0x207 (0x5639dd84b067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5639dd8312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #59: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #60: PyObject_Call + 0xbc (0x55a937f2af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #57: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5639dd82f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #59: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55a937f112b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #62: + 0x150582 (0x55a937f2a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #63: PyObject_Call + 0xbc (0x55a937f2af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #60: PyObject_Call + 0xbc (0x5639dd84af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5639dd8312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #62: + 0x150582 (0x5639dd84a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #63: PyObject_Call + 0xbc (0x5639dd84af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank33]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
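For context: the store->get('0:1') in the error message refers to the c10d key-value store through which NCCL bootstraps a communicator for a sender/receiver pair. The sketch below is illustrative only (the host, port and key are stand-ins, not values from this run): the rank-0 side publishes its ncclUniqueId under the pair key and the rank-1 side blocks in get() on it; if the publishing process has already crashed, the get() fails with "Connection reset by peer", which is exactly the DistBackendError every rank reports above.

    import datetime
    from torch.distributed import TCPStore

    # Stand-in for rank 0: host the store and publish a value under the pair key.
    # (In the real run the value would be the ncclUniqueId; this string is a dummy.)
    server = TCPStore("127.0.0.1", 29511, world_size=2, is_master=True,
                      wait_for_workers=False, timeout=datetime.timedelta(seconds=30))
    server.set("0:1", "dummy-nccl-unique-id")

    # Stand-in for rank 1: connect to the same store and fetch the key.  If the
    # server process had died before set() ran, this get() would surface the
    # "Connection reset by peer" error seen in the log.
    client = TCPStore("127.0.0.1", 29511, world_size=2, is_master=False,
                      wait_for_workers=False, timeout=datetime.timedelta(seconds=30))
    print(client.get("0:1"))  # b'dummy-nccl-unique-id'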
-[default4]:[rank44]: Traceback (most recent call last):
[... rank 44 prints the same Python traceback as rank 45 above (run_train.py -> trainer.train -> training_step -> pipeline engine train_batch_iter / forward -> llama.py forward / forward_with_hidden_states -> pipeline_parallel/block.py recv_from_pipeline_state_buffer -> functional.py -> state.py -> p2p.py _recv_meta -> dist.recv) ...]
-[default4]:[rank44]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default4]:[rank44]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
[... followed by the same C++ stack, frames #0-#63 ...]
-[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue.
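The Python traceback repeated by ranks 44, 45 and the others follows nanotron's pipeline-parallel receive path (block.py -> functional.py -> state.py -> p2p.py -> dist.recv): the receiving stage first receives a small metadata message describing the incoming activation and only then the activation itself, and the first point-to-point call for a given pair is what triggers the NCCL-communicator setup that fails here. A rough sketch of that pattern, not nanotron's actual implementation: it assumes torch.distributed is already initialized with the NCCL backend and a sending rank issues matching dist.send calls, and it reduces the metadata to a single length value.

    import torch
    import torch.distributed as dist

    def recv_activation(from_rank: int, device: torch.device) -> torch.Tensor:
        # 1) Receive the metadata first (here reduced to the flattened length).
        #    This is the dist.recv call where the DistBackendError above is
        #    raised when the sending rank has already crashed.
        meta = torch.empty(1, dtype=torch.long, device=device)
        dist.recv(meta, src=from_rank)
        # 2) Allocate a buffer of the advertised size and receive the payload.
        buffer = torch.empty(int(meta.item()), dtype=torch.bfloat16, device=device)
        dist.recv(buffer, src=from_rank)
        return buffer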
-[default4]:[rank36]: Traceback (most recent call last):
[... rank 36 prints the same Python traceback as rank 45 above, ending in dist.recv inside p2p.py _recv_meta ...]
-[default4]:[rank36]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default4]:[rank36]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
[... followed by the same C++ stack, frames #0-#63 ...]
-[default4]:[rank36]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default0]:[rank56]: Traceback (most recent call last):
-[default1]:[rank57]: Traceback (most recent call last):
-[default6]:[rank62]: Traceback (most recent call last):
[... ranks 56, 57 and 62 begin the same traceback (run_train.py -> trainer.train -> training_step -> pipeline_engine.train_batch_iter -> forward -> model(**micro_batch)), interleaved with the rank 36 output above ...]
-[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: pipeline_state.run_communication() -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank62]: pipeline_state.run_communication() -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: pipeline_state.run_communication() -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: dist.recv( -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: dist.recv( -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41ec4ef897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank56]: frame #1: + 0x5b3a23e (0x7f422600c23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: dist.recv( -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f4226006c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f4226006f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f456fddc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank62]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank57]: frame #1: + 0x5b3a23e (0x7f45a98f923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcacc57c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank56]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f4226007fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #1: + 0x5b3a23e (0x7fcb0609923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, 
std::chrono::duration >) + 0x2c7 (0x7f45a98f3c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4225fbc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fcb06093c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4225fbc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f45a98f3f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4225fbc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fcb06093f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fcb06094fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f45a98f4fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb06049371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f45a98a9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb06049371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb06049371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f45a98a9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4225fbc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb06049371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f45a98a9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #9: 
c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f41ed7c9189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fcacd856189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f41ed7d0610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f45a98a9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fcacd85d610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f41ed7ef978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fcacd87c978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f45710b6189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #12: + 0x5adc309 (0x7fcb0603b309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f45710bd610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #13: + 0x5ae6f10 (0x7fcb06045f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f45710dc978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #12: + 0x5adc309 (0x7f45a989b309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #14: + 0x5ae6fa5 (0x7fcb06045fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #12: + 0x5adc309 (0x7f4225fae309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #13: + 0x5ae6f10 (0x7f4225fb8f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #15: + 0x5124446 (0x7fcb05683446 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #16: + 0x1acf4b8 (0x7fcb0202e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #14: + 0x5ae6fa5 (0x7f4225fb8fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #17: + 0x5aee004 (0x7fcb0604d004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #13: + 0x5ae6f10 (0x7f45a98a5f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #15: + 0x5124446 (0x7f42255f6446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #18: + 0x5af36b5 (0x7fcb060526b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #19: + 0xd2631e (0x7fcb18c3c31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #20: + 0x47def4 (0x7fcb18393ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #16: + 0x1acf4b8 (0x7f4221fa14b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #14: + 0x5ae6fa5 (0x7f45a98a5fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #21: + 0x1445a6 (0x55a18ee4d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #17: + 0x5aee004 (0x7f4225fc0004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #15: + 0x5124446 (0x7f45a8ee3446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #18: + 0x5af36b5 (0x7f4225fc56b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #16: + 0x1acf4b8 (0x7f45a588e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #19: + 0xd2631e (0x7f4238baf31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #20: + 0x47def4 (0x7f4238306ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55a18ee46a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #23: + 0x150866 (0x55a18ee59866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #21: + 0x1445a6 (0x557695fb45a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #17: + 0x5aee004 (0x7f45a98ad004 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #22: _PyObject_MakeTpCall + 0x26b (0x557695fada6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #23: + 0x150866 (0x557695fc0866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #18: + 0x5af36b5 (0x7f45a98b26b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55a18ee42142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #19: + 0xd2631e (0x7f45bc49c31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55a18ee4da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x557695fa9142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #20: + 0x47def4 (0x7f45bbbf3ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #26: PyObject_Call + 0xbc (0x55a18ee59f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #25: _PyFunction_Vectorcall + 0x6c (0x557695fb4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #21: + 0x1445a6 (0x55cea371f5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #26: PyObject_Call + 0xbc (0x557695fc0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55a18ee402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55cea3718a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x557695fa72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55a18ee4da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #23: + 0x150866 (0x55cea372b866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #28: _PyFunction_Vectorcall + 0x6c (0x557695fb4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55cea3714142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55cea371fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55a18ee3e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x557695fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #30: + 0x150582 (0x557695fc0582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #26: PyObject_Call + 0xbc (0x55cea372bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #30: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55a18ee3e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x557695fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #32: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55cea37122b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55a18ee3e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #32: + 0x150582 (0x557695fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55cea371fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #34: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x557695fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55cea37108fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #34: + 0x150582 (0x557695fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x557695fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #30: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55a18ee3e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x557695facf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #37: _PyObject_Call_Prepend + 0x69 (0x557695fbec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55a18ee45f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55cea37108fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #38: + 0x211239 (0x557696081239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #32: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55a18ee57c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #39: _PyObject_MakeTpCall + 0x26b (0x557695fada6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank56]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x557695fa93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55cea37108fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #34: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55cea37108fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #38: + 0x211239 (0x55a18ef1a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55a18ee46a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55cea3717f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55a18ee423e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55cea3729c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557695fb4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55a18ee4da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #38: + 0x211239 (0x55cea37ec239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55a18ee3dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557695fa4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55cea3718a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55cea37143e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557695fb4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x557695fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55a18ee4da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55cea371fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55a18ee3e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #45: + 0x150582 (0x557695fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55cea370fc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #45: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank56]: frame #46: PyObject_Call + 0xbc (0x557695fc0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #46: PyObject_Call + 0xbc (0x55a18ee59f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55cea371fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x557695fa72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55a18ee402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55cea37108fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #48: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #48: + 0x150582 (0x557695fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #45: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #49: PyObject_Call + 0xbc (0x55a18ee59f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #46: PyObject_Call + 0xbc (0x55cea372bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55a18ee402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55cea37122b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55a18ee4da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #48: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #49: PyObject_Call + 0xbc (0x557695fc0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55a18ee46007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #49: PyObject_Call + 0xbc (0x55cea372bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x557695fa72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55a18ee57c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55cea37122b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55cea371fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #54: + 0x211239 (0x55a18ef1a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557695fb4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #55: PyObject_Call + 0x207 
(0x55a18ee5a067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55a18ee402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557695fad007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #57: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55cea3718007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55cea3729c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55a18ee3e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #59: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557695fbec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #60: PyObject_Call + 0xbc (0x55a18ee59f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55a18ee402b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #54: + 0x211239 (0x55cea37ec239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #54: + 0x211239 (0x557696081239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #55: PyObject_Call + 0x207 (0x55cea372c067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #62: + 0x150582 (0x55a18ee59582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #55: PyObject_Call + 0x207 (0x557695fc1067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55cea37122b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #63: PyObject_Call + 0xbc (0x55a18ee59f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x557695fa72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #57: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default0]:[rank56]: frame #57: + 0x150582 (0x557695fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55cea37108fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x557695fa58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #59: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #60: PyObject_Call + 0xbc (0x55cea372bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55cea37122b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #59: + 0x150582 (0x557695fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #60: PyObject_Call + 0xbc (0x557695fc0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #62: + 0x150582 (0x55cea372b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #63: PyObject_Call + 0xbc (0x55cea372bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default0]:[rank56]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x557695fa72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #62: + 0x150582 (0x557695fc0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #63: PyObject_Call + 0xbc (0x557695fc0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default4]:[rank60]: Traceback (most recent call last):
-[default4]:[rank60]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank60]:     trainer.train(dataloader)
-[default4]:[rank60]:   (same call chain as rank 57 above: trainer.train -> training_step -> pipeline_engine.train_batch_iter -> engine.forward -> llama.forward -> forward_with_hidden_states -> block.forward -> recv_from_pipeline_state_buffer -> pipeline_state.run_communication -> recv_activation -> p2p.recv_tensors -> irecv_tensors -> _recv_meta -> dist.recv)
-[default4]:[rank60]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank60]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default4]:[rank60]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default4]:[rank60]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3a5074897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:[rank60]: frames #1-#50: same c10d::TCPStore / c10d::PrefixStore / c10d::ProcessGroupNCCL and CPython interpreter frame sequence as rank 36 above
-[default4]:[rank60]: frame #51:
_PyFunction_Vectorcall + 0x6c (0x557a9691ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557a96915007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557a96926c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #54: + 0x211239 (0x557a969e9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #55: PyObject_Call + 0x207 (0x557a96929067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x557a9690f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #57: + 0x150582 (0x557a96928582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x557a9690d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #59: + 0x150582 (0x557a96928582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #60: PyObject_Call + 0xbc (0x557a96928f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x557a9690f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #62: + 0x150582 (0x557a96928582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x557a96928f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[ranks 38, 32, and 51: same Python traceback as rank 60 above (run_train.py line 237 through recv_from_pipeline_state_buffer and dist.recv), each ending in torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer; the same C++ call stack (frames #0-#63, differing only in addresses); and the note: This may indicate a possible application crash on rank 0 or a network set up issue.]
-[default5]:[rank37]: Traceback (most recent call last):
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default5]:[rank37]:     trainer.train(dataloader)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank37]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank37]:     outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default5]:[rank37]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank37]:     output = model(**micro_batch)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank37]:     return self._call_impl(*args, **kwargs)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank37]:     return forward_call(*args, **kwargs)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank37]:     sharded_logits = self.model(
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank37]:     return self._call_impl(*args, **kwargs)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank37]:     return forward_call(*args, **kwargs)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank37]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank37]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank37]:     return self._call_impl(*args, **kwargs)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank37]:     return forward_call(*args, **kwargs)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank37]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank37]:     pipeline_state.run_communication()
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank37]:     recv_activation_tensor = recv_activation()
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank37]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank37]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank37]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default5]:[rank37]:     dist.recv(
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank37]:     return func(*args, **kwargs)
-[default5]:[rank37]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank37]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank37]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default5]:[rank37]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default5]:[rank37]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f342de24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:[rank37]: frame #1: + 0x5b3a23e (0x7f346794123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f346793bc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f346793bf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f346793cfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f34678f1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f34678f1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f34678f1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f34678f1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank37]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f342f0fe189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank37]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f342f105610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank37]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f342f124978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank37]: [frames #12-#20: unnamed libtorch_cpu.so / libtorch_python.so frames; frames #21-#63: CPython interpreter call frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10]
-[default5]:[rank37]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[the same traceback (run_train.py -> trainer.train -> training_step -> train_batch_iter -> pipeline_parallel block.py forward -> recv_from_pipeline_state_buffer -> p2p._recv_meta -> dist.recv) and the same DistBackendError are emitted, interleaved, by ranks 35, 48, 49, 52, 53, 54, 55, 58, 59, 61 and 63]
-[default3]:[rank35]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank35]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default3]:[rank35]: 
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe60a745897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank35]: frame #1: + 0x5b3a23e (0x7fe64426223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fe64425cc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb5b1821897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fe64425cf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fe64425dfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe644212371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe644212371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe644212371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libto[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -rch_cpu.so) -[default3]:[rank35]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe644212371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank35]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fe60ba1f189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fe60ba26610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, 
int) + 0x5f8 (0x7fe60ba45978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #12: + 0x5adc309 (0x7fe644204309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #13: [default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: recv_activation_tensor = recv_activation() -+ 0x5ae6f10 (0x7fe64420ef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5579918128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #34: + 0x150582 (0x55799182d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #1: + 0x5b3a23e (0x7fb5eb33e23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: dist.recv( -[default3]:[rank35]: frame #14: + 0x5ae6fa5 (0x7fe64420efa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: frame #15: + 0x5124446 (0x7fe64384c446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #16: + 0x1acf4b8 (0x7fe6401f74b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #17: + 0x5aee004 (0x7fe644216004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #18: + 0x5af36b5 (0x7fe64421b6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5579918128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: frame #19: + 0xd2631e (0x7fe656e0531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: frame #20: + 0x47def4 (0x7fe65655cef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: return 
self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: frame #21: + 0x1445a6 (0x56033bca35a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56033bc9ca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #23: + 0x150866 (0x56033bcaf866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56033bc98142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fb5eb338c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56033bca3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #26: PyObject_Call + 0xbc (0x56033bcaff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56033bc962b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56033bca3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56033bc948fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #30: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56033bc948fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cl[default2]:[rank58]: return func(*args, **kwargs) -uster/bin/python3.10) -[default3]:[rank35]: frame #32: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank35]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56033bc948fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #34: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56033bc948fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56033bc9bf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56033bcadc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: frame #38: + 0x211239 (0x56033bd70239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56033bc9ca6b in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56033bc983e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56033bca3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x557991819f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fb5eb338f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56033bc93c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56033bca3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56033bc948fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #45: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #46: PyObject_Call + 0xbc (0x56033bcaff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56033bc962b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #48: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-clu[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: trainer.train(dataloader) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default7]:[rank55]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55799182bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #38: + 0x211239 (0x5579918ee239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: dist.recv( -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -ster/bin/python3.10) -[default3]:[rank35]: frame #49: PyObject_Call + 0xbc (0x56033bcaff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56033bc962b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56033bca3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56033bc9c007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56033bcadc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #54: + 
0x211239 (0x56033bd70239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fb5eb339fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #55: PyObject_Call + 0x207 (0x56033bcb0067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56033bc962b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #57: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56033bc948fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #59: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55799181aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: frame #60: PyObject_Call + 0xbc (0x56033bcaff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56033bc962b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #62: + 0x150582 (0x56033bcaf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #63: PyObject_Call + 0xbc (0x56033bcaff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb5eb2ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: return func(*args, **kwargs) -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb5eb2ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f419ad43897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb5eb2ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank58]: frame #1: + 0x5b3a23e (0x7f41d486023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) 
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank59]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb5eb2ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: dist.recv( -[default2]:[rank58]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f41d485ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5579918163e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: dist.recv( -[default3]:[rank59]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fb5b2afb189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557991821a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f41d485af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f41d485bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fb5b2b02610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default7]:[rank55]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557991811c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: return func(*args, **kwargs) -[default7]:[rank55]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557991821a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f507984a897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank58]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f41d4810371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: frame #1: + 0x5b3a23e (0x7f50b336723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fb5b2b21978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f41d4810371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank63]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f50b3361c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #12: + 0x5adc309 (0x7fb5eb2e0309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f41d4810371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f50b3361f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f41d4810371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #13: + 0x5ae6f10 (0x7fb5eb2eaf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank63]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f50b3362fd1 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50b3317371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: dist.recv( -[default2]:[rank58]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f419c01d189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50b3317371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #14: + 0x5ae6fa5 (0x7fb5eb2eafa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: pipeline_state.run_communication() -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f419c024610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f419c043978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank54]: pipeline_state.run_communication() -[default7]:[rank63]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50b3317371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: return func(*args, **kwargs) -[default3]:[rank59]: frame #15: + 0x5124446 (0x7fb5ea928446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #16: + 0x1acf4b8 (0x7fb5e72d34b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6ac0f06897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: frame #12: + 0x5adc309 (0x7f41d4802309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5579918128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #45: + 0x150582 (0x55799182d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #17: + 0x5aee004 (0x7fb5eb2f2004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: frame #1: + 0x5b3a23e (0x7f6afaa2323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #13: + 0x5ae6f10 (0x7f41d480cf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f50b3317371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f507ab24189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: frame #18: + 0x5af36b5 (0x7fb5eb2f76b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f507ab2b610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f6afaa1dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: frame #14: + 0x5ae6fa5 (0x7f41d480cfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #15: + 0x5124446 (0x7f41d3e4a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank63]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f507ab4a978 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f6afaa1df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default2]:[rank58]: frame #16: + 0x1acf4b8 (0x7f41d07f54b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: frame #12: + 0x5adc309 (0x7f50b3309309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #19: + 0xd2631e (0x7fb5fdee131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank58]: frame #17: + 0x5aee004 (0x7f41d4814004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f6afaa1efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank58]: frame #18: + 0x5af36b5 (0x7f41d48196b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b82784897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank63]: frame #13: + 0x5ae6f10 (0x7f50b3313f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6afa9d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #19: + 0xd2631e (0x7f41e740331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: frame #20: + 0x47def4 (0x7fb5fd638ef4 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: frame #21: + 0x1445a6 (0x5593130465a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #20: + 0x47def4 (0x7f41e6b5aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank61]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6afa9d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6afa9d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6afa9d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: frame #21: + 0x1445a6 (0x55dc64d9e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55dc64d97a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #46: PyObject_Call + 0xbc (0x55799182df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f6ac21e0189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5579918142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f6ac21e7610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank59]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55931303fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: frame #48: + 0x150582 (0x55799182d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #23: + 0x150866 (0x55dc64daa866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #49: PyObject_Call + 0xbc (0x55799182df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 
(0x55dc64d93142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #14: + 0x5ae6fa5 (0x7f50b3313fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank59]: frame #23: + 0x150866 (0x559313052866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f6ac2206978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55dc64d9ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: dist.recv( -[default3]:[rank59]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55931303b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default7]:[rank63]: frame #15: + 0x5124446 (0x7f50b2951446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5c23b27897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank61]: frame #12: + 0x5adc309 (0x7f6afa9c5309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5579918142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557991821a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #26: PyObject_Call + 0xbc (0x55dc64daaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #25: _PyFunction_Vectorcall + 0x6c (0x559313046a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: frame #16: + 0x1acf4b8 (0x7f50af2fc4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #26: PyObject_Call + 0xbc (0x559313052f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #13: + 0x5ae6f10 (0x7f6afa9cff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5593130392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #28: _PyFunction_Vectorcall + 0x6c (0x559313046a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #29: _PyEval_EvalFrameDefault + 
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank50]: sharded_logits = self.model(
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank50]: return self._call_impl(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank50]: return forward_call(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank50]: return self._call_impl(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank50]: return forward_call(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default2]:[rank50]: pipeline_state.run_communication()
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default2]:[rank50]: recv_activation_tensor = recv_activation()
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default2]:[rank50]: dist.recv(
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank50]: return func(*args, **kwargs)
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank50]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default2]:[rank50]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc651db5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:[rank50]: frame #1: + 0x5b3a23e (0x7fc68b8d223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fc68b8ccc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fc68b8ccf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fc68b8cdfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc68b882371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc68b882371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc68b882371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc68b882371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default2]:[rank50]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fc65308f189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:[rank50]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fc653096610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:[rank50]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fc6530b5978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
[... frames #12 and onward are unsymbolized libtorch_cpu/libtorch_python offsets followed by CPython interpreter frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, ...), ending with ". This may indicate a possible application crash on rank 0 or a network set up issue." ...]
[... ranks 48, 49, 52, 53, 54, 55, 58, 59, 61, and 63 emit the same nanotron pipeline-parallel recv traceback, the same DistBackendError ("Connection reset by peer" from store->get('0:1')), and equivalent C++/CPython stack traces with identical frames at different addresses ...]
-[default4]:[rank52]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5613e36c1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #18: + 0x5af36b5 (0x7fc68b88b6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #26: PyObject_Call + 0xbc (0x562a3b022f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #38: + 0x211239 (0x5613e3784239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b6c34f08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #30: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x562a3b0092b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b6c34f08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #30: + 0x150582 (0x5629c11d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5613e36b0a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5613e36ac3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #28: _PyFunction_Vectorcall + 0x6c (0x562a3b016a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #19: + 0xd2631e (0x7fc69e47531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #20: + 0x47def4 (0x7fc69dbccef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank49]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5629c11b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5613e36b7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #21: + 0x1445a6 (0x55d936c265a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55d936c1fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x562a3b0078fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5613e36a7c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5613e36b7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #23: + 0x150866 (0x55d936c32866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #32: + 0x150582 (0x5629c11d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5629c11b98fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #30: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55d936c1b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55d936c26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x562a3b0078fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5613e36a88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #32: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b6c34f08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #34: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #45: + 0x150582 (0x5613e36c3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #34: + 0x150582 (0x5629c11d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #26: PyObject_Call + 0xbc (0x55d936c32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #46: PyObject_Call + 0xbc (0x5613e36c3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5613e36aa2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b6c34f08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #48: + 0x150582 (0x5613e36c3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5629c11b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5629c11c0f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b6c34f7f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #49: PyObject_Call + 0xbc (0x5613e36c3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5613e36aa2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5613e36b7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b6c3509c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5629c11d2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5613e36b0007 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5613e36c1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #38: + 0x211239 (0x5629c1295239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #38: + 0x211239 (0x55b6c35cc239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #32: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5629c11c1a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5629c11bd3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x562a3b0078fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #54: + 0x211239 (0x5613e3784239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5629c11c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5629c11b8c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #55: PyObject_Call + 0x207 (0x5613e36c4067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #34: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x562a3b0078fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5629c11c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5629c11b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b6c34f8a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b6c34f43e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5613e36aa2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55d936c192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55d936c26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x562a3b00ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #37: _PyObject_Call_Prepend + 0x69 (0x562a3b020c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #57: + 0x150582 (0x5613e36c3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #45: + 0x150582 (0x5629c11d4582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #46: PyObject_Call + 0xbc (0x5629c11d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55d936c178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #38: + 0x211239 (0x562a3b0e3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #39: _PyObject_MakeTpCall + 0x26b (0x562a3b00fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x562a3b00b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b6c34ffa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5629c11bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #48: + 0x150582 (0x5629c11d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b6c34efc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b6c34ffa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5613e36a88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #30: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55d936c178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #32: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #59: + 0x150582 (0x5613e36c3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #60: PyObject_Call + 0xbc (0x5613e36c3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55d936c178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #49: PyObject_Call + 0xbc (0x5629c11d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5629c11bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #34: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b6c34f08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #45: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5613e36aa2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5629c11c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default1]:[rank49]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5629c11c1007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55d936c178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55d936c1ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #41: _PyFunction_Vectorcall + 0x6c (0x562a3b016a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #62: + 0x150582 (0x5613e36c3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #63: PyObject_Call + 0xbc (0x5613e36c3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default6]:[rank54]: frame #46: PyObject_Call + 0xbc (0x55b6c350bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5629c11d2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #54: + 0x211239 (0x5629c1295239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b6c34f22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #48: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x562a3b006c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #43: _PyFunction_Vectorcall + 0x6c (0x562a3b016a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #55: PyObject_Call + 0x207 (0x5629c11d5067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #49: PyObject_Call + 0xbc (0x55b6c350bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b6c34f22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5629c11bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #57: + 0x150582 (0x5629c11d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55d936c30c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x562a3b0078fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #45: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b6c34ffa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b6c34f8007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #46: PyObject_Call + 0xbc (0x562a3b022f1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #38: + 0x211239 (0x55d936cf3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b6c3509c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x562a3b0092b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55d936c1fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #48: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5629c11b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #59: + 0x150582 (0x5629c11d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #49: PyObject_Call + 0xbc (0x562a3b022f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x562a3b0092b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #51: _PyFunction_Vectorcall + 0x6c (0x562a3b016a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x562a3b00f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #54: + 0x211239 (0x55b6c35cc239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #60: PyObject_Call + 0xbc (0x5629c11d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5629c11bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #55: PyObject_Call + 0x207 (0x55b6c350c067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b6c34f22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55d936c1b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55d936c26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55d936c16c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #57: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55d936c26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b6c34f08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #59: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #60: PyObject_Call + 0xbc (0x55b6c350bf1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55d936c178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #62: + 0x150582 (0x5629c11d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #63: PyObject_Call + 0xbc (0x5629c11d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #53: _PyObject_Call_Prepend + 0x69 (0x562a3b020c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #45: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #54: + 0x211239 (0x562a3b0e3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #55: PyObject_Call + 0x207 (0x562a3b023067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #46: PyObject_Call + 0xbc (0x55d936c32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55d936c192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank50]: frame #48: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b6c34f22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #62: + 0x150582 (0x55b6c350b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #49: PyObject_Call + 0xbc (0x55d936c32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55d936c192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #63: PyObject_Call + 0xbc (0x55b6c350bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x562a3b0092b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #57: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default0]:[rank48]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x562a3b0078fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #59: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #60: PyObject_Call + 0xbc (0x562a3b022f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x562a3b0092b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #62: + 0x150582 (0x562a3b022582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #63: PyObject_Call + 0xbc (0x562a3b022f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55d936c26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55d936c1f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55d936c30c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #54: + 0x211239 (0x55d936cf3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #55: PyObject_Call + 0x207 (0x55d936c33067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55d936c192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #57: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55d936c178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #59: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #60: PyObject_Call + 0xbc (0x55d936c32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank50]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55d936c192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #62: + 0x150582 (0x55d936c32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #63: PyObject_Call + 0xbc (0x55d936c32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-E0702 21:47:08.161000 139896234325824 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1737002) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1737003) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1737004) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1737005) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1737006) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1737007) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1737008) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1737009) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_21:47:08 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1737002) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
-============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -W0702 21:47:12.033000 139984724735744 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3853684_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:12.278000 139673051064064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1382157_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:12.547000 140391475324672 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-162-233.ec2.internal_1360550_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:12.582000 140463420364544 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_843383_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:12.632000 139770468677376 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_830490_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:12.709000 140041302849280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3726035_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:12.875000 140100434835200 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1103218_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0702 21:47:13.014000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726114 closing signal SIGTERM -W0702 21:47:13.014000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726115 closing signal SIGTERM -W0702 21:47:13.014000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726116 closing signal SIGTERM -W0702 21:47:13.016000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726117 closing signal SIGTERM -W0702 21:47:13.016000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726118 closing signal SIGTERM -W0702 21:47:13.016000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726119 closing signal SIGTERM -W0702 21:47:13.018000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726120 closing signal SIGTERM -W0702 21:47:13.018000 140046963582784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3726121 closing signal SIGTERM -W0702 21:47:13.029000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360628 closing signal SIGTERM -W0702 21:47:13.030000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360629 closing signal SIGTERM -W0702 21:47:13.030000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360630 closing signal SIGTERM -W0702 21:47:13.031000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360631 closing signal SIGTERM -W0702 21:47:13.031000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360632 closing signal SIGTERM -W0702 21:47:13.032000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360633 closing signal SIGTERM -W0702 21:47:13.032000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360634 closing signal SIGTERM -W0702 21:47:13.033000 140397136058176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1360635 closing signal SIGTERM -W0702 21:47:13.046000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853764 closing signal SIGTERM -W0702 21:47:13.046000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853765 closing signal SIGTERM -W0702 21:47:13.047000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853766 closing signal SIGTERM -W0702 21:47:13.047000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853767 closing signal SIGTERM -W0702 21:47:13.048000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853768 closing signal SIGTERM -W0702 21:47:13.048000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853769 closing signal SIGTERM -W0702 21:47:13.049000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853770 closing signal SIGTERM -W0702 21:47:13.050000 139990385469248 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3853771 closing signal SIGTERM -E0702 21:47:13.163000 140106095568704 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1103296) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 21:47:13.169000 139776129410880 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 830568) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 21:47:13.172000 140106095568704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1103218_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0702 21:47:13.173000 139678711797568 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1382236) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 21:47:13.176000 140469081098048 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 843461) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0702 21:47:13.175000 139776129410880 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_830490_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:13.178000 139678711797568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1382157_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:13.182000 140469081098048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_843383_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:13.200000 140106095568704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1103218_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:13.205000 139678711797568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1382157_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:13.203000 139776129410880 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_830490_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:13.211000 140469081098048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_843383_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:13.228000 140106095568704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1103218_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
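Once the first agent and its C10d store are gone, the surviving agents can only log keep-alive and shutdown failures of type RendezvousConnectionError, as in the warnings above. Below is a rough, hypothetical helper for triaging a log like this one: it pulls out each agent's "Root Cause" host and counts the follow-on rendezvous/store errors. The patterns come from the text shown here; the script itself is not part of this repository.

```python
# Hypothetical log-triage helper; patterns match the messages visible above.
import re
import sys

ROOT_CAUSE = re.compile(
    r"Root Cause \(first observed failure\):.*?host : (\S+).*?rank : (\d+)",
    re.S,
)
RDZV_ERR = re.compile(r"RendezvousConnectionError|DistNetworkError: Broken pipe")

def triage(path: str) -> None:
    text = open(path, errors="replace").read()
    for host, rank in ROOT_CAUSE.findall(text):
        print(f"first observed failure on {host} (global rank {rank})")
    n_rdzv = len(RDZV_ERR.findall(text))
    if n_rdzv:
        print(f"{n_rdzv} rendezvous/store connection errors "
              "(expected once the agent hosting the TCPStore has exited)")

if __name__ == "__main__":
    triage(sys.argv[1] if len(sys.argv) > 1 else "log.out")
```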
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1103297) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 1103298) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1103299) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 1103300) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1103301) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1103302) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1103303) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1103296) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0702 21:47:13.233000 139678711797568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1382157_0' has failed to shutdown the rendezvous 'none' due to 
an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent -W0702 21:47:13.235000 139776129410880 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_830490_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 1382237) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 1382238) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 1382239) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 1382240) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 1382241) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 1382242) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 1382243) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-153.ec2.internal - rank 
: 16 (local_rank: 0) - exitcode : 1 (pid: 1382236) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 830569) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 830570) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 830571) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 830572) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 830573) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 830574) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 830575) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_21:47:13 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 830568) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0702 21:47:13.244000 140469081098048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The 
node 'ip-26-0-171-88.ec2.internal_843383_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 843462) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 843463) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 843464) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 843465) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 843466) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 843467) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 843468) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_21:47:13 - host : ip-26-0-171-88.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 843461) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: 
error: ip-26-0-161-153: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -W0702 21:47:15.184000 140397136058176 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1360550_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:15.196000 140397136058176 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1360550_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise 
RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-162-233: task 4: Exited with exit code 1 -W0702 21:47:16.777000 140046963582784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3726035_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:16.789000 140046963582784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3726035_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0702 21:47:17.037000 139984724735744 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3853684_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -W0702 21:47:17.088000 139990385469248 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3853684_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0702 21:47:17.100000 139990385469248 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3853684_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-1024/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/bench.slurm deleted file mode 100644 index 0b5572f77263384b6feb203ef5b2e90427feb8c6..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/config.yaml deleted file mode 100644 index 5acb965d10e07194807b8ab044342d23e17cd993..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 8 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 128 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out deleted file mode 100644 index 18ecfa7da31b19f8323699a60ae87a833b568453..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/log.out +++ /dev/null @@ -1,6285 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:42:04 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
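The RendezvousConnectionError and "Broken pipe" traces earlier in this diff appear to be follow-on failures: once the first worker group dies with ChildFailedError, the c10d store backing the rendezvous goes away and the surviving torchrun agents can no longer sync with it or shut it down. The bench.slurm scripts in this sweep (including the one just above) then bucket such a log by grepping log.out; a minimal Python equivalent of that classification, with an illustrative function name, is:

# Sketch of the grep-based status classification in the bench.slurm scripts;
# the return values mirror the script's status.txt buckets, everything else
# (the function name, reading the whole log into memory) is illustrative.
def classify_log(log_text: str, exit_status: int) -> str:
    if exit_status == 0:
        return "completed"
    if "OutOfMemoryError" in log_text or "CUDA error: an illegal memory access" in log_text:
        return "oom"
    if "Timeout at NCCL" in log_text:
        return "timeout"
    return "fail"

# e.g. classify_log(open("log.out").read(), exit_status=1)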
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:42:07.708000 140380479002432 torch/distributed/run.py:757] -W0703 09:42:07.708000 140380479002432 torch/distributed/run.py:757] ***************************************** -W0703 09:42:07.708000 140380479002432 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:07.708000 140380479002432 torch/distributed/run.py:757] ***************************************** -W0703 09:42:09.895000 139841750644544 torch/distributed/run.py:757] -W0703 09:42:09.895000 139841750644544 torch/distributed/run.py:757] ***************************************** -W0703 09:42:09.895000 139841750644544 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:09.895000 139841750644544 torch/distributed/run.py:757] ***************************************** -W0703 09:42:09.897000 139791769425728 torch/distributed/run.py:757] -W0703 09:42:09.897000 139791769425728 torch/distributed/run.py:757] ***************************************** -W0703 09:42:09.897000 139791769425728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:09.897000 139791769425728 torch/distributed/run.py:757] ***************************************** -W0703 09:42:10.212000 140004522362688 torch/distributed/run.py:757] -W0703 09:42:10.212000 140004522362688 torch/distributed/run.py:757] ***************************************** -W0703 09:42:10.212000 140004522362688 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:10.212000 140004522362688 torch/distributed/run.py:757] ***************************************** -W0703 09:42:10.343000 139794222888768 torch/distributed/run.py:757] -W0703 09:42:10.343000 139794222888768 torch/distributed/run.py:757] ***************************************** -W0703 09:42:10.343000 139794222888768 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:42:10.343000 139794222888768 torch/distributed/run.py:757] ***************************************** -W0703 09:42:10.946000 139687632484160 torch/distributed/run.py:757] -W0703 09:42:10.946000 139687632484160 torch/distributed/run.py:757] ***************************************** -W0703 09:42:10.946000 139687632484160 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:10.946000 139687632484160 torch/distributed/run.py:757] ***************************************** -W0703 09:42:11.176000 140060554409792 torch/distributed/run.py:757] -W0703 09:42:11.176000 140060554409792 torch/distributed/run.py:757] ***************************************** -W0703 09:42:11.176000 140060554409792 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:11.176000 140060554409792 torch/distributed/run.py:757] ***************************************** -W0703 09:42:11.512000 140540360554304 torch/distributed/run.py:757] -W0703 09:42:11.512000 140540360554304 torch/distributed/run.py:757] ***************************************** -W0703 09:42:11.512000 140540360554304 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:11.512000 140540360554304 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:42:36 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config: -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: run='%date_%jobid', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: step=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: consumed_train_samples=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: benchmark_csv_path=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp=2, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp=32, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp_engine=, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_mode=, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: expert_parallel_size=1), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:42:36 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: eos_token_id=2, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_act='silu', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_size=2048, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: initializer_range=0.02, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: intermediate_size=4096, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: is_llama_config=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: max_position_embeddings=4096, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_attention_heads=32, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_hidden_layers=24, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_key_value_heads=32, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pad_token_id=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pretraining_tp=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_scaling=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_theta=10000.0, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tie_word_embeddings=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: use_cache=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: vocab_size=50272), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_revision=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_max_length=None), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoint_interval=100000, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: save_initial_state=False, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: log_level_replica='info', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: train_steps=20, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: micro_batch_size=128, -[default0]:07/03/2024 09:42:36 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: batch_accumulation_per_replica=8, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: val_check_interval=-1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_val_batches=0, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_test_batches=0), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta1=0.9, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta2=0.95, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: name='adamW'), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: zero_stage=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: weight_decay=0.01, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: clip_grad=1.0, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_steps=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_style='linear', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_style='linear', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_steps=19, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: start_training_step=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_splits='train', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: text_column_name='text'), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_loading_workers=0))], -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128')), -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lighteval=None) -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Model Config: -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: 
eos_token_id=2, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_act='silu', -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_size=2048, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: initializer_range=0.02, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: intermediate_size=4096, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: is_llama_config=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: max_position_embeddings=4096, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_attention_heads=32, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_hidden_layers=24, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_key_value_heads=32, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pad_token_id=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pretraining_tp=1, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_scaling=None, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_theta=10000.0, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tie_word_embeddings=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: use_cache=True, -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: vocab_size=50272) -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Building model.. -[default0]:07/03/2024 09:42:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Setting PP block ranks... -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-56]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-56]: No checkpoint path provided. -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-56]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-56]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-56]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-56]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-56]: No checkpoint path provided. -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: No checkpoint path provided. -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: No checkpoint path provided. -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-56]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-56]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-56]: No checkpoint path provided. -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: No checkpoint path provided. -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: No checkpoint path provided. -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: No checkpoint path provided. 
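The "[Vocab Size Padding]" message earlier in this log (50257 padded to 50272 with 15 dummy tokens) is consistent with rounding the vocabulary up to a multiple of tp * make_vocab_size_divisible_by; assuming that is the rule nanotron applies here, the arithmetic checks out:

# Assumes the padding rule is "round vocab_size up to the next multiple of
# tp * make_vocab_size_divisible_by"; the values come from the deleted config.yaml.
import math

vocab_size = 50257
tp, make_vocab_size_divisible_by = 32, 1

multiple = tp * make_vocab_size_divisible_by
padded = math.ceil(vocab_size / multiple) * multiple
print(padded, padded - vocab_size)  # 50272 15, matching the log line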
-[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: No checkpoint path provided. -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Parametrizing model parameters using StandardParametrizator -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=6|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=6|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=6|ip-26-0-170-31]: No checkpoint path provided. -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=7|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=7|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=7|ip-26-0-170-31]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=5|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=5|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=5|ip-26-0-170-31]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=4|ip-26-0-170-31]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=4|ip-26-0-170-31]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=4|ip-26-0-170-31]: No checkpoint path provided. -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=23|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=23|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=23|ip-26-0-169-239]: No checkpoint path provided. -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=25|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=25|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=25|ip-26-0-169-247]: No checkpoint path provided. -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=24|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=24|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=24|ip-26-0-169-247]: No checkpoint path provided. -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=27|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=27|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=27|ip-26-0-169-247]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: No checkpoint path provided. 
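The per-rank figures in these lines can be cross-checked against the reported total of 1.22G parameters (2318.88MiB): each of the two pipeline stages is split across 32 TP ranks, and weights are held in bfloat16 (2 bytes per parameter). A rough consistency check using the rounded values from the log:

# Rough cross-check of the parameter bookkeeping printed in this log; 21.6M and
# 16.4M are the rounded per-rank values for pipeline stages 0 and 1 respectively.
pp0_local, pp1_local = 21.6e6, 16.4e6          # local params per TP rank
total = 32 * (pp0_local + pp1_local)           # 32 TP ranks per pipeline stage
print(f"total ~= {total / 1e9:.2f}G")          # ~1.22G, matching "Total number of parameters"
print(f"pp0 rank ~= {pp0_local * 2 / 2**20:.1f}MiB in bf16")  # ~41.2MiB vs. 41.25MiB reported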
-[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=31|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=21|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=21|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=31|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=31|ip-26-0-169-247]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=21|ip-26-0-169-239]: No checkpoint path provided. -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=22|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=22|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=22|ip-26-0-169-239]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=20|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=20|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=16|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=16|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=16|ip-26-0-169-239]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=20|ip-26-0-169-239]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=18|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=26|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=17|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=17|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=17|ip-26-0-169-239]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=18|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=26|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=18|ip-26-0-169-239]: No checkpoint path provided. -[default2]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=26|ip-26-0-169-247]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=28|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=28|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=28|ip-26-0-169-247]: No checkpoint path provided. -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=19|ip-26-0-169-239]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=19|ip-26-0-169-239]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=19|ip-26-0-169-239]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: No checkpoint path provided. 
-[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=29|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=29|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=29|ip-26-0-169-247]: No checkpoint path provided. -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:42:55 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=30|ip-26-0-169-247]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=30|ip-26-0-169-247]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:42:55 [INFO|DP=0|PP=0|TP=30|ip-26-0-169-247]: No checkpoint path provided. 
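The per-rank figures above are mutually consistent if the model weights are held in a 2-byte dtype (bf16/fp16): 21.6M parameters comes out to about 41.25 MiB on the PP=0 ranks and 16.4M parameters to about 31.22 MiB on the PP=1 ranks. A minimal sanity-check sketch of that arithmetic (the 2-byte-per-parameter figure is inferred from the numbers, not stated anywhere in the log):

    # Sanity check: "Local number of parameters" vs. the MiB figure reported next to it.
    # Assumes 2 bytes per parameter (bf16/fp16) -- an inference, not something the log states.
    BYTES_PER_PARAM = 2

    def params_to_mib(n_params: float) -> float:
        return n_params * BYTES_PER_PARAM / 2**20

    print(f"PP=0 shard: {params_to_mib(21.6e6):.1f} MiB")  # ~41.2 MiB vs. logged 41.25MiB
    print(f"PP=1 shard: {params_to_mib(16.4e6):.1f} MiB")  # ~31.3 MiB vs. logged 31.22MiB

The "[After model building] Memory usage" numbers (55.26MiB on PP=0, 41.23MiB on PP=1) sit about 14 MiB above the parameter sizes in both cases, i.e. the same amount of additional non-parameter allocation is reported on every rank at that point.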
-[default0]:07/03/2024 09:42:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:42:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:42:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 09:42:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:42:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Using `datasets` library -[default0]:07/03/2024 09:42:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:42:58 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:42:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:42:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:42:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: -[default0]:07/03/2024 09:42:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Start training] datetime: 2024-07-03 09:42:59.717786 | mbs: 128 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:42:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:42:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default6]:07/03/2024 09:42:59 [WARNING|DP=0|PP=0|TP=6|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:42:59 [WARNING|DP=0|PP=0|TP=30|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:42:59 [WARNING|DP=0|PP=1|TP=11|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:42:59 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:42:59 [WARNING|DP=0|PP=1|TP=14|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:42:59 [WARNING|DP=0|PP=1|TP=15|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:42:59 [WARNING|DP=0|PP=1|TP=10|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 09:42:59 [WARNING|DP=0|PP=1|TP=8|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
-[... the same "Repo card metadata block was not found. Setting CardData to empty." warning repeats here for the remaining DP=0 ranks (PP=0 and PP=1, TP=0-31) on every node, interleaved with untimestamped copies of the same message ...]
-[default3]:Repo card metadata block was not found. 
Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:42:59 [WARNING|DP=0|PP=0|TP=8|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:43:00 [WARNING|DP=0|PP=0|TP=15|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:42:59 [WARNING|DP=0|PP=0|TP=9|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:42:59 [WARNING|DP=0|PP=1|TP=21|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:43:05 [WARNING|DP=0|PP=1|TP=30|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: sharded_logits = self.model( -[default0]:[rank8]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/na[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -notron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank2]: output = model(**micro_batch) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank2]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) 
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/l[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -ib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank2]: output = self.pp_block(**new_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return fo[default0]:[rank8]: return self._call_impl(*args, **kwargs) -rward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank2]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank2]: output = self.o_proj(attention_output) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.[default0]:[rank8]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank2]: return row_linear( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank2]: out = F.linear(input, weight, bias) -[default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.85 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management ([default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank8]: output = self.o_proj(attention_output) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: return row_linear( -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: out = F.linear(input, 
weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank11]: output = model(**micro_batch) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank11]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: output = model(**micro_batch) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) 
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: sharded_logits = self.model( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank10]: output = self.o_proj(attention_output) -[default2]:[rank10]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return row_linear( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 493.94 MiB is free. Including non-PyTorch memory, this process has 78.84 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: output = self.o_proj(attention_output) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 601.94 MiB is free. Including non-PyTorch memory, this process has 78.73 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank5]: output = model(**micro_batch) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank5]: sharded_logits = self.model( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank5]: 
return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5[default7]:[rank15]: Traceback (most recent call last): -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: trainer.train(dataloader) -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank9]: output = model(**micro_batch) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank9]: sharded_logits = self.model( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: sharded_logits = self.model( -[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: output = 
self.pp_block(**new_kwargs)
-[default1]:[rank9]: Traceback (most recent call last):
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank9]: trainer.train(dataloader)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank9]: output = model(**micro_batch)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank9]: sharded_logits = self.model(
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default1]:[rank9]: output = self.pp_block(**new_kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default1]:[rank9]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default1]:[rank9]: output = self.o_proj(attention_output)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default1]:[rank9]: return row_linear(
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank9]: out = F.linear(input, weight, bias)
-[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 601.94 MiB is free. Including non-PyTorch memory, this process has 78.73 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-The same OutOfMemoryError with an identical call chain (o_proj -> row_linear -> F.linear) is reported, heavily interleaved, by the other failing ranks in this portion of the log. Ranks 13, 15, 20 and 22 show the same figures as rank 9 (601.94 MiB free, 78.73 GiB in use); ranks 26 and 28 show 673.94 MiB free (78.66 GiB in use); ranks 12 and 14 show 493.94 MiB free (78.84 GiB in use); rank 5 shows 1.36 GiB free (77.96 GiB in use); rank 6 shows 1.47 GiB free (77.85 GiB in use). The messages for ranks 0 and 24 are cut off after "Tried to allocate 2.00 GiB. GPU", and rank 1's traceback is truncated by the interleaving. In every complete message 68.85 GiB is allocated by PyTorch, 78.03 MiB is reserved but unallocated, and the allocator suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -notron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank1]: output = model(**micro_batch) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank1]: sharded_logits = self.model( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nano[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -tron/models/llama.py", line 764, in forward -[default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank1[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default0]:[rank16]: trainer.train(dataloader) -]: output = self.pp_block(**new_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank1]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/minif[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -orge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: trainer.train(dataloader) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank1]: output = self.o_proj(attention_output) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank1]: return row_linear( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_para[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -llel/functional.py", line 474, in row_linear -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: out = F.linear(input, weight, bias) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.36 GiB is free. Including non-PyTorch memory, this process has 77.96 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/na[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -notron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank4]: output = model(**micro_batch) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank4]: sharded_logits = self.model( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nano[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -tron/models/llama.py", line 764, in forward -[default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in 
forward_with_hidden_states -[default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: output = model(**micro_batch) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank4]: output = self.pp_block(**new_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in [default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -forward -[default4]:[rank4]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank4]: output = self.o_proj(attention_output) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: Fi[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -le "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank4]: return row_linear( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank4]: out = F.linear(input, weight, bias) -[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. 
Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.85 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: output = model(**micro_batch) -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank3]: sharded_logits = self.model( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default3]:[rank3]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank3]: output = self.o_proj(attention_output) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: output = model(**micro_batch) -[default3]:[rank3]: return self._call_impl(*args, 
**kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank3]: return row_linear( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: out = F.linear(input, weight, bias) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.36 GiB is free. Including non-PyTorch memory, this process has 77.96 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, [default0]:[rank16]: return forward_call(*args, **kwargs) -in forward -[default7]:[rank7]: output = model(**micro_batch) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank7]: sharded_logits = self.model( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank7]: [default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward - return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: sharded_logits = self.model( -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank7]: output = self.pp_block(**new_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank7]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank7]: output = self.o_proj(attention_output) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: return 
row_linear( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank7]: out = F.linear(input, weight, bias) -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.36 GiB is free. Including non-PyTorch memory, this process has 77.96 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default7]:[rank23]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank27]: outputs, 
loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank27]: output = model(**micro_batch) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*arg[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -s, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: output = self.o_proj(attention_output) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) 
-[default1]:[rank25]: return forward_call(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: sharded_logits = self.model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: sharded_logits = self.model( -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank23]: output = self.o_proj(attention_output) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank17]: output = self.o_proj(attention_output) -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. 
GPU -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 493.94 MiB is free. Including non-PyTorch memory, this process has 78.84 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank25]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 493.94 MiB is free. Including non-PyTorch memory, this process has 78.84 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward 
-[default3]:[rank19]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster[default1]:[rank25]: output = self.o_proj(attention_output) -/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl - 631, in forward -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: output = self.o_proj(attention_output) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[defau[default1]:[rank25]: return self._call_impl(*args, **kwargs) -lt3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: return row_linear( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 493.94 MiB is free. Including non-PyTorch memory, this process has 78.84 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: output = self.o_proj(attention_output) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = model(**micro_batch) -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default1]:[rank25]: return row_linear( -[default6]:[rank30]: return forward_call(*args, **kwargs) 
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: sharded_logits = self.model( -[default7]:[rank31]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default5]:[rank29]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, 
in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -n/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear - -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 861.94 MiB is free. Including non-PyTorch memory, this process has 78.48 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/py[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -thon3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: output = self.o_proj(attention_output) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: trainer.train(dataloader) -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 601.94 MiB is free. Including non-PyTorch memory, this process has 78.73 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 861.94 MiB is free. Including non-PyTorch memory, this process has 78.48 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: output = model(**micro_batch) -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank29]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: output = model(**micro_batch) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", 
line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default6]:[rank30]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank31]: sharded_logits = self.model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank30]: output = self.o_proj(attention_output) -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: 
File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: return row_linear( -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 673.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank29]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: output = model(**micro_batch) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: sharded_logits = self.model( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -n/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl - -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default7]:[rank31]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank21]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank21]: output = self.o_proj(attention_output) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank21]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 493.94 MiB is free. Including non-PyTorch memory, this proces[default7]:[rank31]: return self._call_impl(*args, **kwargs) -s has 78.84 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default5]:[rank29]: output = self.o_proj(attention_output) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: output = self.o_proj(attention_output) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: return row_linear( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank29]: out = F.linear(input, weight, bias) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward 
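Note: each of these OOM messages ends with the same hint, setting `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`. A minimal sketch of how that could be applied is below; placing it at the top of the training entry point is an assumption (the variable only needs to be set before the CUDA caching allocator makes its first allocation, and exporting it in the launching sbatch script would work equally well).

```python
# Minimal sketch, assuming it runs before any CUDA allocation (e.g. at the very
# top of run_train.py, or exported in the sbatch script instead).
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # imported only after the allocator configuration is in place
```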
-[default7]:[rank31]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 861.94 MiB is free. Including non-PyTorch memory, this process has 78.48 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 861.94 MiB is free. Including non-PyTorch memory, this process has 78.48 GiB memory in use. Of the allocated memory 68.85 GiB is allocated by PyTorch, and 78.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank42]: Traceback (most recent call last): -[default5]:[rank45]: Traceback (most recent call last): -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: trainer.train(dataloader) -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) 
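Note: before reaching for allocator flags, it is worth checking the figures the OOM messages themselves report: only ~0.5–0.85 GiB is free against a 2.00 GiB request, and just 78.03 MiB is reserved-but-unallocated, so fragmentation is not the dominant problem and the expandable-segments setting sketched above is unlikely to be sufficient on its own; the devices are simply close to full. A quick check using only numbers copied from the messages (e.g. ranks 17, 19 and 23):

```python
# Sanity check on the values quoted by the OOM messages above.
GiB = 1024 ** 3
MiB = 1024 ** 2

requested        = 2.00 * GiB    # "Tried to allocate 2.00 GiB"
free             = 493.94 * MiB  # "of which 493.94 MiB is free"
in_use           = 78.84 * GiB   # "this process has 78.84 GiB memory in use"
capacity         = 79.33 * GiB   # "total capacity of 79.33 GiB"
reserved_unalloc = 78.03 * MiB   # "reserved by PyTorch but unallocated"

assert requested > free + reserved_unalloc                 # cannot fit even after defragmentation
print(f"shortfall:   {(requested - free) / GiB:.2f} GiB")  # ~1.52 GiB short
print(f"utilisation: {in_use / capacity:.1%}")             # ~99.4% of the device in use
```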
-[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: output = model(**micro_batch) -[default1]:[rank41]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default7]:[rank47]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default7]:[rank47]: sharded_logits = self.model( -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: trainer.train(dataloader) -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default0]:[rank40]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: 
pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank45]: dist.recv( -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank42]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return func(*args, **kwargs) -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default5]:[rank45]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank45]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank41]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0477600897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: frame #1: + 0x5b3a23e (0x7f04b111d23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f04b1117c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f04b1117f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8fe8fe7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: frame #1: + 0x5b3a23e (0x7f9022b0423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f9022afec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f9022afef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: return func(*args, **kwargs) -[default1]:[rank41]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank45]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f04b1118fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #5: c10d::PrefixStore::get(std::string const&) + 
0x31 (0x7f04b10cd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f9022afffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f04b10cd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9022ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9022ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9022ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank41]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a820b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9022ab4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank45]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f04b10cd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f04b10cd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f8fea2c1189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f8fea2c8610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb7aaa33897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default2]:[rank42]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f8fea2e7978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: pipeline_state.run_communication() -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[def[default5]:[rank45]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f04788da189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f04788e1610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -ault1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from[default5]:[rank45]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f0478900978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag[default5]:[rank45]: frame #12: + 0x5adc309 (0x7f04b10bf309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #12: + 0x5adc309 (0x7f9022aa6309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] [default7]:[rank47]: frame #1: + 0x5b3a23e (0x7fb7e455023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank33]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank33]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1d11395897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fb7e454ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #1: + 0x5b3a23e (0x7f1d4aeb223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f1d4aeacc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f1d4aeacf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f1d4aeadfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1d4ae62371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1d4ae62371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1d4ae62371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1d4ae62371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f1d1266f189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f1d12676610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f1d12695978 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #12: + 0x5adc309 (0x7f1d4ae54309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #13: + 0x5ae6f10 (0x7f1d4ae5ef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #14: + 0x5ae6fa5 (0x7f1d4ae5efa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #15: + 0x5124446 (0x7f1d4a49c446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #16: + 0x1acf4b8 (0x7f1d46e474b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #17: + 0x5aee004 (0x7f1d4ae66004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #18: + 0x5af36b5 (0x7f1d4ae6b6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #19: + 0xd2631e (0x7f1d5da5531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank33]: frame #20: + 0x47def4 (0x7f1d5d1acef4 in /fsx/fe[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -rdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank33]: frame #21: + 0x1445a6 (0x55b9cf0545a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b9cf04da6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #23: + 0x150866 (0x55b9cf060866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b9cf049142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b9cf054a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #26: PyObject_Call + 0xbc (0x55b9cf060f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #27: [default5]:[rank45]: frame #13: + 0x5ae6f10 (0x7f04b10c9f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -_PyEval_EvalFrameDefault + 0x2d83 (0x55b9cf0472b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55b9cf054a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b9cf0458fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #30: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b9cf0458fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #32: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b9cf0458fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.1[default1]:[rank41]: frame #1: + 0x5b3a23e (0x7f4abbbcf23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -0) -[default1]:[rank33]: frame #34: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b9cf0458fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b9cf04cf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b9cf05ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #38: + 0x211239 (0x55b9cf121239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b9cf04da6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b9cf0493e6 in /fsx/ferdinandmom/miniforge3/envs[default1]:[rank41]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f4abbbc9c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #13: + 0x5ae6f10 (0x7f9022ab0f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b9cf054a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b9cf044c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b9cf054a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b9cf0458fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #45: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #46: PyObject_Call + 0xbc (0x55b9cf060f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b9cf0472b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-clu[default5]:[rank45]: frame #14: + 0x5ae6fa5 (0x7f04b10c9fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ster/bin/python3.10) -[default1]:[rank41]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f4abbbc9f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #48: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #49: PyObject_Call + 0xbc 
(0x55b9cf060f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b9cf0472b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b9cf054a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b9cf04d007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f4abbbcafd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b9cf05ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #54: + 0x211239 (0x55b9cf121239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #55: PyObject_Call + 0x207 (0x55b9cf061067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4abbb7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b9cf0472b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #57: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b9cf0458fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4abbb7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #59: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #60: PyObject_Call + 0xbc (0x55b9cf060f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b9cf0472b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #62: + 0x150582 (0x55b9cf060582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #63: PyObject_Call + 0xbc (0x55b9cf060f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank41]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4abbb7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4abbb7f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f4a8338c189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default1]:[rank41]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4a83393610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: frame #14: + 0x5ae6fa5 (0x7f9022ab0fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fb7e454af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/py[default5]:[rank45]: frame #15: + 0x5124446 (0x7f04b0707446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -thon3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: Traceback (most recent call last): -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4a833b2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #12: + 0x5adc309 (0x7f4abbb71309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: frame #13: + 0x5ae6f10 (0x7f4abbb7bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: frame #16: + 0x1acf4b8 (0x7f04ad0b24b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #17: + 0x5aee004 (0x7f04b10d1004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] 
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fb7e454bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb7e4500371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: frame #14: + 0x5ae6fa5 (0x7f4abbb7bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #15: + 0x5124446 (0x7f4abb1b9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: frame #18: + 0x5af36b5 (0x7f04b10d66b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank32]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank45]: frame #19: + 0xd2631e (0x7f04c3cc031e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #20: + 0x47def4 (0x7f04c3417ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f144ae66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank32]: frame #1: + 0x5b3a23e (0x7f148498323e in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f148497dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f148497df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: trainer.train(dataloader) -[default0]:[rank32]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f148497efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1484933371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1484933371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb7e4500371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1484933371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1484933371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f144c140189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #15: + 0x5124446 (0x7f90220ee446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #16: + 0x1acf4b8 (0x7f901ea994b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f144c147610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f144c166978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #12: + 0x5adc309 (0x7f1484925309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #13: + 0x5ae6f10 (0x7f148492ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #16: + 0x1acf4b8 (0x7f4ab7b644b8 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #14: + 0x5ae6fa5 (0x7f148492ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #15: + 0x5124446 (0x7f1483f6d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #16: + 0x1acf4b8 (0x7f14809184b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #17: + 0x5aee004 (0x7f1484937004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: trainer.train(dataloader) -[default3]:[rank43]: trainer.train(dataloader) -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank32]: frame #18: + 0x5af36b5 (0x7f148493c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #19: + 0xd2631e (0x7f149752631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #20: + 0x47def4 (0x7f1496c7def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: frame #21: + 0x1445a6 (0x5578e7e965a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #21: + 0x1445a6 (0x55699b8525a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55699b84ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: frame #23: + 0x150866 (0x55699b85e866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55699b847142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55699b852a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb7e4500371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #26: PyObject_Call + 0xbc (0x55699b85ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55699b8452b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55699b852a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55699b8438fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default5]:[rank45]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5578e7e8fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #23: + 0x150866 (0x5578e7ea2866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #30: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55699b8438fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #32: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55699b8438fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #34: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55699b8438fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55699b84af50 in /fsx/ferdinandmom/miniforge3/en[default5]:[rank45]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5578e7e8b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -vs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55699b85cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #38: + 0x211239 (0x55699b91f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55699b84ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55699b8473e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #17: + 0x5aee004 (0x7f4abbb83004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55699b852a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55699b842c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55699b852a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #18: + 0x5af36b5 (0x7f4abbb886b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #19: + 0xd2631e (0x7f4ace77231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55699b8438fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #45: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got 
error: Connection reset by peer -[default0]:[rank32]: frame #46: PyObject_Call + 0xbc (0x55699b85ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55699b8452b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #48: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #17: + 0x5aee004 (0x7f9022ab8004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #18: + 0x5af36b5 (0x7f9022abd6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #49: PyObject_Call + 0xbc (0x55699b85ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55699b8452b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55699b852a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55699b84b007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55699b85cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #54: + 0x211239 (0x55699b91f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #20: + 0x47def4 (0x7f4acdec9ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #19: + 0xd2631e (0x7f90356a731e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #55: PyObject_Call + 0x207 (0x55699b85f067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55699b8452b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #57: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb7e4500371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55699b8438fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #59: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5578e7e96a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #60: PyObject_Call + 0xbc (0x55699b85ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55699b8452b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #62: + 0x150582 (0x55699b85e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: 
frame #63: PyObject_Call + 0xbc (0x55699b85ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #26: PyObject_Call + 0xbc (0x5578e7ea2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank45]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5578e7e892b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default5]:[rank45]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5578e7e96a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #20: + 0x47def4 (0x7f9034dfeef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[def[default0]:[rank40]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -ault3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-clus[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -ter/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fb7abd0d189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site[default7]:[rank47]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fb7abd14610 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) --packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank35]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank35]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f785d2ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank35]: frame #1: + 0x5b3a23e (0x7f7896e0b23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration + 0x1445a6 (0x56442ab8a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -> >) + 0x2c7 (0x7f7896e05c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f7896e05f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7896e06fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7896dbb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7896dbb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #7: c10d::PrefixStore::get(st[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -d::string const&) + 0x31 (0x7f7896dbb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7896dbb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f785e5c8189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f785e5cf610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f785e[default1]:[rank41]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56442ab83a6b in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #21: + 0x1445a6 (0x5626ef77b5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5626ef774a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -5ee978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #12: + 0x5adc309 (0x7f7896dad309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #13: + 0x5ae6f10 (0x7f7896db7f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #14: + 0x5ae6fa5 (0x7f7896db7fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #15: + 0x5124446 (0x7f78963f5446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #16: + 0x1acf4b8 (0x7f7892da04b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3[default0]:[rank40]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8479e88897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #17: + 0x5aee004 (0x7f7896dbf004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #18: + 0x5af36b5 (0x7f7896dc46b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #19: + 0xd2631e (0x7f78a99ae31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: frame #20: + 0x47def4 (0x7f78a9105ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: frame #21: + 0x1445a6 (0x5620dfe665a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -20dfe5fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #23: + 0x150866 (0x5620dfe72866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5620dfe5b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5620dfe66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #26: PyObject_Call + 0xbc (0x5620dfe72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5620dfe592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5620dfe66a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #29: _PyEval_EvalFrame[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: frame #23: + 0x150866 (0x5626ef787866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -Default + 0x13ca (0x5620dfe578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #30: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5620dfe578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #32: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5620dfe578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #34: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5620dfe578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[ra[default5]:[rank45]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5578e7e878fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -nk35]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5620dfe5ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5620dfe70c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #38: + 0x211239 (0x5620dff33239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5620dfe5fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5620dfe5b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5620dfe66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5620dfe56c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -/bin/python3.10) -[default3]:[rank35]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5620dfe66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5620dfe578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #45: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #46: PyObject_Call + 0xbc (0x5620dfe72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5620dfe592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default3]:[rank35]: frame #48: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #49: PyObject_Call + 0xbc (0x5620dfe72f1c in /fsx/ferdinandmom/miniforge3/envs/env-benc[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -h-cluster/bin/python3.10) -[default3]:[rank35]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5620dfe592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5620dfe66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5620dfe5f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5620dfe70c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #54: + 0x211239 (0x5620dff33239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #55: PyObject_Call + 0x207 (0x5620dfe73067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5620dfe592b3 in /fsx/ferdinandmo[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -m/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #57: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5620dfe578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #59: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #60: PyObject_Call + 0xbc (0x5620dfe72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5620dfe592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #62: + 0x150582 (0x5620dfe72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #63: PyObject_Call + 0xbc (0x5620dfe72f1c in /fsx/fe[default5]:[rank45]: frame #30: + 0x150582 (0x5578e7ea2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fb7abd33978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -rdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default1]:[rank41]: frame #23: + 0x150866 (0x56442ab96866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[def[default1]:[rank41]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56442ab7f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #1: + 0x5b3a23e (0x7f84b39a523e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ault4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5578e7e878fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: output = 
model(**micro_batch) -[default0]:[rank40]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f84b399fc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/[default1]:[rank41]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56442ab8aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #26: PyObject_Call + 0xbc (0x56442ab96f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56442ab7d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank36]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank36]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfba9db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank36]: frame #1: + 0x5b3a23e (0x7fdff44f823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fdff44f2c87 in /fsx/ferdinandmom/miniforge3/envs/env[default2]:[rank42]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5626ef770142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #12: + 0x5adc309 (0x7fb7e44f2309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) --bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fdff44f2f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fdff44f3fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdff44a8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdff44a8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdff44a8371 in /fsx/ferdinandmom/minifor[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -ge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdff44a8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fdfbbcb5189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank36]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fdfbbcbc610 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank36]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fdfbbcdb978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/pyt[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -hon3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #32: + 0x150582 (0x5578e7ea2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56442ab8aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #12: + 0x5adc309 (0x7fdff449a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #13: + 0x5ae6f10 (0x7fdff44a4f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: frame #14: + 0x5ae6fa5 (0x7fdff44a4fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #15: + 0x5124446 (0x7fdff3ae2446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #16: + 0x1acf4b8 (0x7fdff048d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5626ef77ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #13: + 0x5ae6f10 (0x7fb7e44fcf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #17: + 0x5aee004 (0x7fdff44ac004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #18: + 0x5af36b5 (0x7fdff44b16b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #19: + 0xd2631e (0x7fe00709b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank36]: frame #20: + 0x47def4 (0x7fe0067f2ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: frame #21: + 0x1445a6 (0x5602b05795a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5602b0572a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #23: + 0x150866 (0x5602b0585866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5602b056e142 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5602b0579a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #26: PyObject_Call + 0xbc (0x5602b0585f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5602b056c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -/bin/python3.10) -[default4]:[rank36]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5602b0579a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5602b056a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #30: + 0x150582 (0x5602b0585582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5602b056a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #32: + 0x150582 (0x5602b0585582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5602b056a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #34: + 0x150582 (0x5602b0585582 in /fsx/ferdinandmom/mi[default5]:[rank45]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5578e7e878fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -niforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5602b056a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5602b0571f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5602b0583c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #38: + 0x211239 (0x5602b0646239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5602b0572a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5602b056e3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5602[default5]:[rank45]: frame #34: + 0x150582 (0x5578e7ea2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -b0579a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5602b0569c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5602b0579a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5602b056a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #45: + 0x150582 (0x5602b0585582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5578e7e878fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #46: PyObject_Call + 0xbc (0x5602b0585f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5602b056c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #48: + 0x150582 (0x5602b0585582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #49: PyObject_Call + 0xbc (0x5602b0585f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5602b056c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5602b0579a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5602b0572007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/[default5]:[rank45]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5578e7e8ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -bin/python3.10) -[default4]:[rank36]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5602b0583c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #54: + 0x211239 (0x5602b0646239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #55: PyObject_Call + 0x207 (0x5602b0586067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56442ab7b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5602b056c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #57: + 0x150582 (0x5602b0585582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5602b056a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #59: + 0x150582 (0x5602b0585582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #60: PyObject_Call + 0xbc (0x5602b0585f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f84b399ff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5602b056c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #62: + 0x150582 (0x5602b0585582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #63: PyObject_Call + 0xbc (0x5602b0585f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank36]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank45]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5578e7ea0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default1]:[rank41]: frame #30: + 0x150582 (0x56442ab96582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56442ab7b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: frame #26: PyObject_Call + 0xbc (0x5626ef787f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #38: + 0x211239 (0x5578e7f63239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f84b39a0fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f84b3955371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #32: + 0x150582 (0x56442ab96582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5626ef76e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5578e7e8fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: frame #14: + 0x5ae6fa5 (0x7fb7e44fcfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #15: + 0x5124446 (0x7fb7e3b3a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5626ef77ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #16: + 0x1acf4b8 (0x7fb7e04e54b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: sharded_logits = self.model( -[default1]:[rank41]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56442ab7b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5626ef76c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default5]:[rank45]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5578e7e8b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5578e7e96a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #34: + 0x150582 (0x56442ab96582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #30: + 0x150582 (0x5626ef787582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank47]: frame #17: + 0x5aee004 (0x7fb7e4504004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #18: + 0x5af36b5 (0x7fb7e45096b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56442ab7b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5626ef76c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #19: + 0xd2631e (0x7fb7f70f331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5578e7e86c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5578e7e96a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56442ab82f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #32: + 0x150582 (0x5626ef787582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #20: + 0x47def4 (0x7fb7f684aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56442ab94c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5578e7e878fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f84b3955371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f84b3955371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: frame #21: + 0x1445a6 (0x55fa99b705a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55fa99b69a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #38: + 0x211239 (0x56442ac57239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: frame #45: + 0x150582 (0x5578e7ea2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #23: + 0x150866 (0x55fa99b7c866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55fa99b65142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default5]:[rank45]: frame #46: PyObject_Call + 0xbc (0x5578e7ea2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f84b3955371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55fa99b70a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5626ef76c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56442ab83a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f847b162189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56442ab7f3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #41: 
_PyFunction_Vectorcall + 0x6c (0x56442ab8aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f847b169610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: Traceback (most recent call last): -[default5]:[rank45]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5578e7e892b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #34: + 0x150582 (0x5626ef787582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5626ef76c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f847b188978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: Traceback (most recent call last): -[default6]:[rank38]: trainer.train(dataloader) -[default2]:[rank34]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: trainer.train(dataloader) -[default1]:[rank41]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56442ab7ac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56442ab8aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: frame #26: PyObject_Call + 0xbc (0x55fa99b7cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank42]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5626ef773f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #48: + 0x150582 (0x5578e7ea2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank47]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa99b632b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: trainer.train(dataloader) -[default3]:[rank43]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: frame #49: PyObject_Call + 0xbc (0x5578e7ea2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56442ab7b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5626ef785c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: frame #38: + 0x211239 (0x5626ef848239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default1]:[rank41]: frame #45: + 0x150582 (0x56442ab96582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #12: + 0x5adc309 (0x7f84b3947309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5626ef774a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: frame #13: + 0x5ae6f10 (0x7f84b3951f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5578e7e892b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: frame #28: 
_PyFunction_Vectorcall + 0x6c (0x55fa99b70a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10)
-[default1]:[rank57]: Traceback (most recent call last):
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank57]:     trainer.train(dataloader)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank57]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank57]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank57]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank57]:     output = model(**micro_batch)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]:     return forward_call(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank57]:     sharded_logits = self.model(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]:     return forward_call(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank57]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank57]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]:     return forward_call(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank57]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank57]:     pipeline_state.run_communication()
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank57]:     recv_activation_tensor = recv_activation()
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank57]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank57]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank57]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank57]:     dist.recv(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank57]:     return func(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank57]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank57]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank57]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42bb833897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:[rank57]: frame #1: + 0x5b3a23e (0x7f42f535023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f42f534ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f42f534af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f42f534bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f42f5300371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f42f5300371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f42f5300371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f42f5300371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank57]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f42bcb0d189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:[rank57]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f42bcb14610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
[... the remaining libtorch/interpreter backtrace frames, and the equivalent interleaved Python tracebacks and C++ frame dumps from ranks 34, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 53, 58, 59, 61 and 62, repeat the same pattern; several end in the same DistBackendError ("store->get('0:1') got error: Connection reset by peer") or in ". This may indicate a possible application crash on rank 0 or a network set up issue." ...]
-[default5]:[rank61]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fb64f05bc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f625945d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f42bcb33978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17a6acd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: frame #38: + 0x211239 (0x562d2ed9c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #39: _PyObject_MakeTpCall + 0x26b (0x562d2ecc8a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: frame #1: + 0x5b3a23e (0x7f5a81a6723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x562d2ecc43e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7ff025c72978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fb64f05bf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #1: + 0x5b3a23e (0x7f17e05ea23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55fa99b70a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default2]:[rank34]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f93d7bd5f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #41: _PyFunction_Vectorcall + 0x6c (0x562d2eccfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return forward_call(*args, **kwargs) 
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: frame #12: + 0x5adc309 (0x7f42f52f2309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x562d2ecbfc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #12: + 0x5adc309 (0x7ff05e431309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #13: + 0x5ae6f10 (0x7ff05e43bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: output = model(**micro_batch) -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5a81a61c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #43: _PyFunction_Vectorcall + 0x6c (0x562d2eccfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #13: + 0x5ae6f10 (0x7f42f52fcf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f17e05e4c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f93d7bd6fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x562d2ecc08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f17e05e4f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: dist.recv( -[default6]:[rank46]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f625945d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f17e05e5fd1 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5a81a61f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f93d7b8b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f6220c6a189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55fa99b69007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f93d7b8b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f6220c71610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return func(*args, **kwargs) -[default6]:[rank38]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5a81a62fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f6220c90978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e059a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f93d7b8b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55fa99b7ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: frame #14: + 0x5ae6fa5 (0x7f42f52fcfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: frame #45: + 0x150582 (0x562d2ecdb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fb64f05cfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: frame #46: PyObject_Call + 0xbc (0x562d2ecdbf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #15: + 0x5124446 (0x7f42f493a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e059a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f93d7b8b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #14: + 0x5ae6fa5 (0x7ff05e43bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #15: + 0x5124446 (0x7ff05da79446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb64f011371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e059a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a81a17371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: frame #54: + 0x211239 (0x55fa99c3d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb64f011371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f939f398189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x562d2ecc22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #55: PyObject_Call + 0x207 (0x55fa99b7d067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb64f011371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: dist.recv( -[default5]:[rank37]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank46]: frame #12: + 0x5adc309 (0x7f625944f309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank49]: dist.recv( -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f939f39f610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #13: + 0x5ae6f10 (0x7f6259459f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: frame #8: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e059a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: dist.recv( -[default7]:[rank55]: Traceback (most recent call last): -[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f939f3be978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa99b632b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f17a7da7189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank37]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b77029897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: frame #57: + 0x150582 (0x55fa99b7c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return func(*args, **kwargs) -[default2]:[rank34]: frame #12: + 0x5adc309 (0x7f93d7b7d309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: frame #48: + 0x150582 (0x562d2ecdb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f17a7dae610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: frame #13: + 0x5ae6f10 (0x7f93d7b87f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f17a7dcd978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #1: + 0x5b3a23e (0x7f5bb0b4623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55fa99b618fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5bb0b40c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #14: + 0x5ae6fa5 (0x7f6259459fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: frame #12: + 0x5adc309 (0x7f17e058c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5bb0b40f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #15: + 0x5124446 (0x7f6258a97446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: frame #13: + 0x5ae6f10 (0x7f17e0596f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank38]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a81a17371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #49: PyObject_Call + 0xbc (0x562d2ecdbf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x562d2ecc22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb64f011371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fb61681e189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5bb0b41fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank57]: frame #16: + 0x1acf4b8 (0x7f42f12e54b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank37]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5bb0af6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #59: + 0x150582 (0x55fa99b7c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe6b44ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank34]: frame #14: + 0x5ae6fa5 (0x7f93d7b87fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #16: + 0x1acf4b8 (0x7f62554424b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fb616825610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: frame #1: + 0x5b3a23e (0x7fe6edfc823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: frame #15: + 0x5124446 (0x7f93d71c5446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f053b2c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5bb0af6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #16: + 0x1acf4b8 (0x7ff05a4244b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: frame #17: + 0x5aee004 (0x7f42f5304004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a81a17371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank43]: frame #17: + 0x5aee004 (0x7ff05e443004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #60: PyObject_Call + 0xbc (0x55fa99b7cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #51: _PyFunction_Vectorcall + 0x6c (0x562d2eccfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #1: + 0x5b3a23e (0x7f0574ddf23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #18: + 0x5af36b5 (0x7f42f53096b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #19: + 0xd2631e (0x7f4307ef331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5bb0af6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #16: + 0x1acf4b8 (0x7f93d3b704b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #18: + 0x5af36b5 (0x7ff05e4486b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fb616844978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank40]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x562d2ecc8007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #12: + 0x5adc309 (0x7fb64f003309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: frame #20: + 0x47def4 (0x7f430764aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #21: + 0x1445a6 (0x5565dabca5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #17: + 0x5aee004 (0x7f93d7b8f004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #17: + 0x5aee004 (0x7f6259461004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5565dabc3a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a81a17371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f164d8d3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: frame #18: + 0x5af36b5 (0x7f62594666b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: trainer.train(dataloader) -[default2]:[rank34]: frame #18: + 0x5af36b5 (0x7f93d7b946b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f0574dd9c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #13: + 0x5ae6f10 (0x7fb64f00df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: output = model(**micro_batch) -[default5]:[rank37]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5bb0af6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #19: + 0xd2631e (0x7f626c05031e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #23: + 0x150866 (0x5565dabd6866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: frame #1: + 0x5b3a23e (0x7f16873f023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #19: + 0xd2631e (0x7f93ea77e31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #19: + 0xd2631e (0x7ff07103231e in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #20: + 0x47def4 (0x7f626b7a7ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5565dabbf142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fe6edfc2c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #14: + 0x5ae6fa5 (0x7f17e0596fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f5b78303189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #21: + 0x1445a6 (0x560769b205a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: Traceback (most recent call last): -[default5]:[rank61]: frame #14: + 0x5ae6fa5 (0x7fb64f00dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #15: + 0x5124446 (0x7fb64e64b446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fe6edfc2f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #20: + 0x47def4 (0x7f93e9ed5ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa99b632b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: sharded_logits = self.model( -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5565dabcaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #16: + 0x1acf4b8 (0x7fb64aff64b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #17: + 0x5aee004 (0x7fb64f015004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #18: + 0x5af36b5 (0x7fb64f01a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) 
-[default1]:[rank49]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fe6edfc3fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #15: + 0x5124446 (0x7f17dfbd4446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f5b7830a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: frame #53: _PyObject_Call_Prepend + 0x69 (0x562d2ecd9c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #54: + 0x211239 (0x562d2ed9c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default7]:[rank39]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f16873eac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #62: + 0x150582 (0x55fa99b7c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: trainer.train(dataloader) -[default5]:[rank61]: frame #19: + 0xd2631e (0x7fb661c0431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: frame #16: + 0x1acf4b8 (0x7f17dc57f4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #17: + 0x5aee004 (0x7f17e059e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f5b78329978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f0574dd9f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f16873eaf82 
in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #55: PyObject_Call + 0x207 (0x562d2ecdc067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: frame #21: + 0x1445a6 (0x55635db255a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #12: + 0x5adc309 (0x7f5bb0ae8309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x562d2ecc22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default5]:[rank61]: frame #20: + 0x47def4 (0x7fb66135bef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank61]: frame #21: + 0x1445a6 (0x560b1b97a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #22: _PyObject_MakeTpCall + 0x26b (0x560b1b973a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #18: + 0x5af36b5 (0x7f17e05a36b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f16873ebfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f0574ddafd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: frame #19: + 0xd2631e (0x7f17f318d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank38]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f5a49224189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #22: _PyObject_MakeTpCall + 0x26b (0x560769b19a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default5]:[rank37]: frame #13: + 0x5ae6f10 (0x7f5bb0af2f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #20: + 0x47def4 (0x7ff070789ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #21: + 
-[… interleaved crash dump from ranks 34–63: every rank prints the same libtorch/CPython C++ frames (c10d::PrefixStore::get, c10d::ProcessGroupNCCL::broadcastUniqueNCCLID, c10d::ProcessGroupNCCL::getNCCLComm, c10d::ProcessGroupNCCL::recv, _PyEval_EvalFrameDefault, …, differing only in addresses) and the same Python traceback; the rank prefixes and torch.nn module.py _wrapped_call_impl/_call_impl wrapper frames are elided and a single representative traceback is kept below …]
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-    outputs = self.pipeline_engine.train_batch_iter(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-    output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-    output = model(**micro_batch)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-    sharded_logits = self.model(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-    return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-    hidden_encoder_states = encoder_block(**hidden_encoder_states)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-    new_kwargs[name] = recv_from_pipeline_state_buffer(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-    pipeline_state.run_communication()
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-    recv_activation_tensor = recv_activation()
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-    return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-    buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-    meta = self._recv_meta(from_rank=from_rank, tag=tag)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-    dist.recv(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-    return func(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-    pg.recv([tensor], group_src_rank, tag).wait()
-torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer. This may indicate a possible application crash on rank 0 or a network set up issue.
_PyFunction_Vectorcall + 0x6c (0x55a1dae56a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55a1dae478fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55796dd0ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #48: + 0x150582 (0x55bd46d61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #49: PyObject_Call + 0xbc (0x55bd46d61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: return func(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55796dd073e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #60: PyObject_Call + 0xbc (0x55635db31f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55784222ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55bd46d482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #30: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55a1dae478fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563ccbf16c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #63: PyObject_Call + 0xbc (0x560769b2cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55796dd12a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55bd46d55a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5565dabbd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55635db182b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank61]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x560b1b96b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank55]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank55]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank37]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563ccbf26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55796dd02c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55bd46d4e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55bd46d5fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #54: + 0x211239 (0x55bd46e22239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #45: + 0x150582 (0x560b1b986582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: frame #32: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55a1dae478fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #62: + 0x150582 (0x55635db31582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #37: _PyObject_Call_Prepend + 0x69 (0x557842241c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #55: PyObject_Call + 0x207 (0x55bd46d62067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55bd46d482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #57: + 0x150582 (0x55bd46d61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: return func(*args, **kwargs) -[default7]:[rank55]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa46cd36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank37]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563ccbf178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55bd46d468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #59: + 0x150582 (0x55bd46d61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #60: 
PyObject_Call + 0xbc (0x55bd46d61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: frame #46: PyObject_Call + 0xbc (0x560b1b986f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #55: PyObject_Call + 0x207 (0x5629e09d9067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #63: PyObject_Call + 0xbc (0x55635db31f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55bd46d482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #62: + 0x150582 (0x55bd46d61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #63: PyObject_Call + 0xbc (0x55bd46d61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank52]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2dd7b77897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: frame #1: + 0x5b3a23e (0x7fa4a685323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55796dd12a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank57]: frame #48: + 0x150582 (0x5565dabd6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5629e09bf2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #57: + 0x150582 (0x5629e09d8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank57]: frame #49: PyObject_Call + 0xbc (0x5565dabd6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5629e09bd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fa4a684dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fa4a684df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55796dd038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fa4a684efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #45: + 0x150582 (0x563ccbf32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: frame #59: + 0x150582 (0x5629e09d8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #38: + 0x211239 (0x557842304239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #45: + 0x150582 (0x55796dd1e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x560b1b96d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: frame #34: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #46: PyObject_Call + 0xbc (0x563ccbf32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: frame #1: + 0x5b3a23e (0x7f2e1169423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #46: PyObject_Call + 0xbc (0x55796dd1ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: pipeline_state.run_communication() -[default5]:[rank53]: frame #60: PyObject_Call + 0xbc (0x5629e09d8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563ccbf192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #39: _PyObject_MakeTpCall + 0x26b (0x557842230a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: frame 
#5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa4a6803371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55796dd052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5565dabbd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5565dabcaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5629e09bf2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #48: + 0x150582 (0x55796dd1e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55784222c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: frame #62: + 0x150582 (0x5629e09d8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #49: PyObject_Call + 0xbc (0x55796dd1ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: frame #48: + 0x150582 (0x560b1b986582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #63: PyObject_Call + 0xbc (0x5629e09d8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa4a6803371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557842237a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #49: PyObject_Call + 0xbc (0x560b1b986f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd16d694897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa4a6803371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f2e1168ec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55796dd052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #48: + 0x150582 (0x563ccbf32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #1: + 0x5b3a23e (0x7fd1a71b123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55a1dae478fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x55796dd12a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x560b1b96d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa4a6803371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank37]: frame #49: PyObject_Call + 0xbc (0x563ccbf32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557842227c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55796dd0b007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd1a71abc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557842237a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55796dd1cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd1a71abf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #54: + 0x211239 (0x55796dddf239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563ccbf192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5578422288fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd1a71acfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #55: PyObject_Call + 0x207 (0x55796dd1f067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #51: _PyFunction_Vectorcall + 0x6c (0x560b1b97aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank39]: frame #45: + 0x150582 (0x557842243582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank63]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank55]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fa46e010189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f2e1168ef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f2e1168ffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55796dd052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #57: + 0x150582 (0x55796dd1e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5565dabc3007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55a1dae4ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #46: PyObject_Call + 0xbc (0x557842243f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563ccbf26a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5565dabd4c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd1a7161371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55a1dae60c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55796dd038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fa46e017610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: frame #59: + 0x150582 (0x55796dd1e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2e11644371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55784222a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default4]:[rank52]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2e11644371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #38: + 0x211239 (0x55a1daf23239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55a1dae4fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563ccbf1f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #60: PyObject_Call + 0xbc (0x55796dd1ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55796dd052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #62: + 0x150582 (0x55796dd1e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #63: PyObject_Call + 0xbc (0x55796dd1ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd1a7161371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2e11644371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fa46e036978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55a1dae4b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank39]: frame #48: + 0x150582 (0x557842243582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563ccbf30c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55a1dae56a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #49: PyObject_Call + 0xbc (0x557842243f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2e11644371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #12: + 0x5adc309 (0x7fa4a67f5309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #13: + 0x5ae6f10 (0x7fa4a67fff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55784222a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2dd8e51189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557842237a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f2dd8e58610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557842230007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55a1dae46c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55a1dae56a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #54: + 0x211239 (0x563ccbff3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #55: PyObject_Call + 0x207 (0x563ccbf33067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55a1dae478fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #14: + 0x5ae6fa5 (0x7fa4a67fffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2dd8e77978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557842241c39 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: frame #12: + 0x5adc309 (0x7f2e11636309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #54: + 0x211239 (0x557842304239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #55: PyObject_Call + 0x207 (0x557842244067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank62]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd1a7161371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #15: + 0x5124446 (0x7fa4a5e3d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55784222a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #57: + 0x150582 (0x557842243582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a7223f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank49]: frame #45: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #16: + 0x1acf4b8 (0x7fa4a27e84b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5578422288fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #59: + 0x150582 (0x557842243582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #54: + 0x211239 (0x5565dac97239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #55: PyObject_Call + 0x207 (0x5565dabd7067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #17: + 0x5aee004 (0x7fa4a6807004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #60: PyObject_Call + 0xbc (0x557842243f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55784222a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #62: + 0x150582 (0x557842243582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5565dabbd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #46: PyObject_Call + 0xbc (0x55a1dae62f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #63: PyObject_Call + 0xbc (0x557842243f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: . 
This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank58]: dist.recv( -[default4]:[rank52]: frame #13: + 0x5ae6f10 (0x7f2e11640f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563ccbf192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #57: + 0x150582 (0x563ccbf32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563ccbf178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #59: + 0x150582 (0x563ccbf32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #1: + 0x5b3a23e (0x7f4aabd5c23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #18: + 0x5af36b5 (0x7fa4a680c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #60: PyObject_Call + 0xbc (0x563ccbf32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563ccbf192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #62: + 0x150582 (0x563ccbf32582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank55]: frame #19: + 0xd2631e (0x7fa4b93f631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: frame #63: PyObject_Call + 0xbc (0x563ccbf32f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default7]:[rank55]: frame #20: + 0x47def4 (0x7fa4b8b4def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd1a7161371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd16e96e189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: frame #21: + 0x1445a6 (0x560aabbb65a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f4aabd56c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #22: _PyObject_MakeTpCall + 0x26b (0x560aabbafa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55a1dae492b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: frame #48: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: trainer.train(dataloader) -[default0]:[rank48]: trainer.train(dataloader) -[default5]:[rank61]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x560b1b973007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #57: + 0x150582 (0x5565dabd6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #23: + 0x150866 (0x560aabbc2866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5565dabbb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: frame #14: + 0x5ae6fa5 (0x7f2e11640fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #15: + 0x5124446 (0x7f2e10c7e446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 353, in recv_tensors -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: frame #53: _PyObject_Call_Prepend + 0x69 (0x560b1b984c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x560aabbab142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #16: + 0x1acf4b8 (0x7f2e0d6294b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #59: + 0x150582 (0x5565dabd6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #17: + 0x5aee004 (0x7f2e11648004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: frame #18: + 0x5af36b5 (0x7f2e1164d6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: frame #25: _PyFunction_Vectorcall + 0x6c (0x560aabbb6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank59]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f4aabd56f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #60: PyObject_Call + 0xbc (0x5565dabd6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd16e975610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd16e994978 in /fsx/ferdinandmom/miniforge3/envs/[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: frame #49: PyObject_Call + 0xbc (0x55a1dae62f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #26: PyObject_Call + 0xbc (0x560aabbc2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f4aabd57fd1 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank56]: dist.recv( -[default1]:[rank57]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5565dabbd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #54: + 0x211239 (0x560b1ba47239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #55: PyObject_Call + 0x207 (0x560b1b987067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55a1dae492b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: frame #62: + 0x150582 (0x5565dabd6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #63: PyObject_Call + 0xbc (0x5565dabd6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank55]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x560aabba92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #28: _PyFunction_Vectorcall + 0x6c (0x560aabbb6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x560b1b96d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: return func(*args, **kwargs) -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: frame #19: + 0xd2631e (0x7f2e2423731e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank55]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x560aabba78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #12: + 0x5adc309 (0x7fd1a7153309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #13: + 0x5ae6f10 (0x7fd1a715df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1916ee1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55a1dae56a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: frame #30: + 0x150582 (0x560aabbc2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: frame #20: + 0x47def4 (0x7f2e2398eef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4aabd0c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x560aabba78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #32: + 0x150582 (0x560aabbc2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #21: + 0x1445a6 (0x559f8ea555a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #1: + 0x5b3a23e (0x7f19509fe23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: frame #57: + 0x150582 (0x560b1b986582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: output = model(**micro_batch) -[default3]:[rank59]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4aabd0c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #22: _PyObject_MakeTpCall + 0x26b (0x559f8ea4ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #23: + 0x150866 (0x559f8ea61866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4aabd0c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f19509f8c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x559f8ea4a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f19509f8f82 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: frame #25: _PyFunction_Vectorcall + 0x6c (0x559f8ea55a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #14: + 0x5ae6fa5 (0x7fd1a715dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55a1dae4f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55a1dae60c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f19509f9fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f19509ae371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f19509ae371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #54: + 0x211239 (0x55a1daf23239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #26: PyObject_Call + 0xbc (0x559f8ea61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x560b1b96b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: frame #59: + 0x150582 (0x560b1b986582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank49]: frame #55: PyObject_Call + 0x207 (0x55a1dae63067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4aabd0c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x560aabba78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #34: + 0x150582 (0x560aabbc2582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f4a73519189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f19509ae371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55a1dae492b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #60: PyObject_Call + 0xbc (0x560b1b986f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #57: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #15: + 0x5124446 (0x7fd1a679b446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x560aabba78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: dist.recv( -[default6]:[rank62]: frame #16: + 0x1acf4b8 (0x7fd1a31464b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x560aabbaef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x559f8ea482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #17: + 0x5aee004 (0x7fd1a7165004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #18: + 0x5af36b5 (0x7fd1a716a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f19509ae371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default2]:[rank58]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f19181bb189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4a73520610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: torch.distributed.DistBackendError: [1] is 
setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank56]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank61]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x560b1b96d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank59]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4a7353f978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b312c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank56]: frame #1: + 0x5b3a23e (0x7f5b6ade123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #28: _PyFunction_Vectorcall + 0x6c (0x559f8ea55a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: frame #19: + 0xd2631e (0x7fd1b9d5431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x559f8ea468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #62: + 0x150582 (0x560b1b986582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: frame #20: + 0x47def4 (0x7fd1b94abef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #21: + 0x1445a6 (0x555cf1a4f5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #30: + 0x150582 (0x559f8ea61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x559f8ea468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #22: _PyObject_MakeTpCall + 0x26b (0x555cf1a48a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #63: PyObject_Call + 0xbc (0x560b1b986f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5b6addbc87 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5b6addbf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #37: _PyObject_Call_Prepend + 0x69 (0x560aabbc0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5b6addcfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf90954897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank63]: frame #1: + 0x5b3a23e (0x7fdfca47123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #23: + 0x150866 (0x555cf1a5b866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55a1dae478fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #59: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank59]: frame #12: + 0x5adc309 (0x7f4aabcfe309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #32: + 0x150582 (0x559f8ea61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #13: + 0x5ae6f10 (0x7f4aabd08f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x559f8ea468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x555cf1a44142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #25: _PyFunction_Vectorcall + 0x6c (0x555cf1a4fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #60: PyObject_Call + 0xbc (0x55a1dae62f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #26: PyObject_Call + 0xbc (0x555cf1a5bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x555cf1a422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #38: + 0x211239 (0x560aabc83239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #39: _PyObject_MakeTpCall + 0x26b (0x560aabbafa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55a1dae492b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #34: + 0x150582 (0x559f8ea61582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x559f8ea468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x560aabbab3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default7]:[rank55]: frame #41: _PyFunction_Vectorcall + 0x6c (0x560aabbb6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #28: _PyFunction_Vectorcall + 0x6c (0x555cf1a4fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x560aabba6c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5b6ad91371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: frame #14: + 0x5ae6fa5 (0x7f4aabd08fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #15: + 0x5124446 (0x7f4aab346446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #16: + 0x1acf4b8 (0x7f4aa7cf14b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x559f8ea4df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #43: _PyFunction_Vectorcall + 0x6c (0x560aabbb6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x560aabba78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f19181c2610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #37: _PyObject_Call_Prepend + 0x69 (0x559f8ea5fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x555cf1a408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #30: + 0x150582 (0x555cf1a5b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default3]:[rank59]: frame #17: + 0x5aee004 (0x7f4aabd10004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #18: + 0x5af36b5 (0x7f4aabd156b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #45: + 0x150582 (0x560aabbc2582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #46: PyObject_Call + 0xbc (0x560aabbc2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x555cf1a408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fdfca46bc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #38: + 0x211239 (0x559f8eb22239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #39: _PyObject_MakeTpCall + 0x26b (0x559f8ea4ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fdfca46bf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: sharded_logits = self.model( -[default2]:[rank58]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f19181e1978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #32: + 0x150582 (0x555cf1a5b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fdfca46cfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x560aabba92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #48: + 0x150582 (0x560aabbc2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #12: + 0x5adc309 (0x7f19509a0309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x555cf1a408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #34: + 0x150582 (0x555cf1a5b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #62: + 0x150582 (0x55a1dae62582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #63: PyObject_Call + 0xbc (0x55a1dae62f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #13: + 0x5ae6f10 (0x7f19509aaf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank55]: frame #49: PyObject_Call + 0xbc (0x560aabbc2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x560aabba92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfca421371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x555cf1a408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x555cf1a47f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #37: _PyObject_Call_Prepend + 0x69 (0x555cf1a59c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #19: + 0xd2631e (0x7f4abe8ff31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #14: + 0x5ae6fa5 (0x7f19509aafa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #15: + 0x5124446 (0x7f194ffe8446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5b6ad91371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5b6ad91371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #16: + 0x1acf4b8 (0x7f194c9934b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #17: + 0x5aee004 (0x7f19509b2004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #18: + 0x5af36b5 (0x7f19509b76b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #38: + 0x211239 (0x555cf1b1c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #39: _PyObject_MakeTpCall + 0x26b (0x555cf1a48a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfca421371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5b6ad91371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f5b3259e189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x559f8ea4a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: 
return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: frame #41: _PyFunction_Vectorcall + 0x6c (0x559f8ea55a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x555cf1a443e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #41: _PyFunction_Vectorcall + 0x6c (0x555cf1a4fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x559f8ea45c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #20: + 0x47def4 (0x7f4abe056ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #19: + 0xd2631e (0x7f19635a131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #20: + 0x47def4 (0x7f1962cf8ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: frame #43: _PyFunction_Vectorcall + 0x6c (0x559f8ea55a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #21: + 0x1445a6 (0x5567a49675a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x559f8ea468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5567a4960a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: frame #21: + 0x1445a6 (0x5635aaaae5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #23: + 0x150866 (0x5567a4973866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5567a495c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #51: _PyFunction_Vectorcall + 0x6c (0x560aabbb6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x560aabbaf007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x555cf1a3fc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfca421371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: frame #43: _PyFunction_Vectorcall + 0x6c (0x555cf1a4fa2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5567a4967a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: frame #26: PyObject_Call + 0xbc (0x5567a4973f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfca421371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #45: + 0x150582 (0x559f8ea61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #46: PyObject_Call + 0xbc (0x559f8ea61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5635aaaa7a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5567a495a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fdf91c2e189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fdf91c35610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fdf91c54978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f5b325a5610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f5b325c4978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x559f8ea482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #23: + 0x150866 (0x5635aaaba866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5635aaaa3142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #48: + 0x150582 (0x559f8ea61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5635aaaaea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #12: + 0x5adc309 (0x7f5b6ad83309 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #13: + 0x5ae6f10 (0x7f5b6ad8df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: frame #49: PyObject_Call + 0xbc (0x559f8ea61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x559f8ea482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #51: _PyFunction_Vectorcall + 0x6c (0x559f8ea55a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5567a4967a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5567a49588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: frame #12: + 0x5adc309 (0x7fdfca413309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #13: + 0x5ae6f10 (0x7fdfca41df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #53: _PyObject_Call_Prepend + 0x69 (0x560aabbc0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #14: + 0x5ae6fa5 (0x7f5b6ad8dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #54: + 0x211239 (0x560aabc83239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #15: + 0x5124446 (0x7f5b6a3cb446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #26: PyObject_Call + 0xbc (0x5635aaabaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: frame #14: + 0x5ae6fa5 (0x7fdfca41dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #15: + 0x5124446 (0x7fdfc9a5b446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states 
-[default2]:[rank58]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5635aaaa12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #16: + 0x1acf4b8 (0x7f5b66d764b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: frame #17: + 0x5aee004 (0x7f5b6ad95004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #16: + 0x1acf4b8 (0x7fdfc64064b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #17: + 0x5aee004 (0x7fdfca425004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x559f8ea4e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #18: + 0x5af36b5 (0x7f5b6ad9a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #30: + 0x150582 (0x5567a4973582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #18: + 0x5af36b5 (0x7fdfca42a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #53: _PyObject_Call_Prepend + 0x69 (0x559f8ea5fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #54: + 0x211239 (0x559f8eb22239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: frame #55: PyObject_Call + 0x207 (0x559f8ea62067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x559f8ea482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #57: + 0x150582 (0x559f8ea61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5635aaaaea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5635aaa9f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #55: PyObject_Call + 0x207 (0x560aabbc3067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x560aabba92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5567a49588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #19: + 0xd2631e (0x7fdfdd01431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank63]: frame #20: + 0x47def4 (0x7fdfdc76bef4 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x555cf1a408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #19: + 0xd2631e (0x7f5b7d98431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #30: + 0x150582 (0x5635aaaba582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #32: + 0x150582 (0x5567a4973582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #45: + 0x150582 (0x555cf1a5b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #21: + 0x1445a6 (0x55a50852e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55a508527a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #57: + 0x150582 (0x560aabbc2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: frame #20: + 0x47def4 (0x7f5b7d0dbef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #21: + 0x1445a6 (0x55b0ba17c5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: frame #46: PyObject_Call + 0xbc (0x555cf1a5bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x559f8ea468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x560aabba78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x555cf1a422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank62]: frame #48: + 0x150582 (0x555cf1a5b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #49: PyObject_Call + 0xbc (0x555cf1a5bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #59: + 0x150582 (0x559f8ea61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #59: + 0x150582 (0x560aabbc2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #60: PyObject_Call + 0xbc (0x560aabbc2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #60: PyObject_Call + 0xbc (0x559f8ea61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x560aabba92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #62: + 0x150582 (0x560aabbc2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x559f8ea482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: frame #23: + 0x150866 (0x55a50853a866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55a508523142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #62: + 0x150582 (0x559f8ea61582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #63: PyObject_Call + 0xbc (0x559f8ea61f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55a50852ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x555cf1a422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: frame #51: _PyFunction_Vectorcall + 0x6c (0x555cf1a4fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #63: PyObject_Call + 0xbc (0x560aabbc2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #26: PyObject_Call + 0xbc (0x55a50853af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b0ba175a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #23: + 0x150866 (0x55b0ba188866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default0]:[rank56]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b0ba171142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5567a49588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #34: + 0x150582 (0x5567a4973582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x555cf1a48007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5635aaa9f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank59]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5567a49588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55a5085212b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55a50852ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5567a495ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5567a4971c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #32: + 0x150582 (0x5635aaaba582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55a50851f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: frame #30: + 0x150582 (0x55a50853a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank63]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55a50851f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b0ba17ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank58]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5635aaa9f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #34: + 0x150582 (0x5635aaaba582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: frame #32: + 0x150582 (0x55a50853a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55a50851f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feabaec8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank63]: frame #34: + 0x150582 (0x55a50853a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5635aaa9f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #1: + 0x5b3a23e (0x7feaf49e523e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5635aaaa6f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: frame #53: _PyObject_Call_Prepend + 0x69 (0x555cf1a59c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7feaf49dfc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #3: 
c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7feaf49dff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #38: + 0x211239 (0x5567a4a34239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7feaf49e0fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #26: PyObject_Call + 0xbc (0x55b0ba188f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feaf4995371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #54: + 0x211239 (0x555cf1b1c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank62]: frame #55: PyObject_Call + 0x207 (0x555cf1a5c067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5567a4960a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5567a495c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feaf4995371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x555cf1a422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: dist.recv( -[default0]:[rank48]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feaf4995371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feaf4995371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7feabc1a2189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank54]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank58]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5635aaab8c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #41: _PyFunction_Vectorcall + 
0x6c (0x5567a4967a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10)
-[default0]:[rank48], [default6]:[rank54], [default0]:[rank56], [default2]:[rank58], [default3]:[rank59], [default6]:[rank62], [default7]:[rank63]: each of these ranks dumps the same backtrace, interleaved across the per-GPU logs; a representative trace (symbol offsets and addresses differ per rank, and the template arguments were stripped by the log capture) is:
-[default6]:[rank54]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank54]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) (libc10.so)
-[default6]:[rank54]: frame #1: libtorch_cpu.so internal
-[default6]:[rank54]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<...>, std::chrono::duration<...>) (libtorch_cpu.so)
-[default6]:[rank54]: frame #3: c10d::TCPStore::doGet(std::string const&) (libtorch_cpu.so)
-[default6]:[rank54]: frame #4: c10d::TCPStore::get(std::string const&) (libtorch_cpu.so)
-[default6]:[rank54]: frames #5-#8: c10d::PrefixStore::get(std::string const&) (libtorch_cpu.so)
-[default6]:[rank54]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) (libtorch_cuda.so)
-[default6]:[rank54]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) (libtorch_cuda.so)
-[default6]:[rank54]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor>&, int, int) (libtorch_cuda.so)
-[default6]:[rank54]: frames #12-#20: libtorch_cpu.so / libtorch_python.so internals
-[default6]:[rank54]: frames #21 onward: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, ...)
-[default6]:[rank62]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
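For readers following frames #2-#10 above: the NCCL communicator for a pair of ranks is created lazily on their first point-to-point exchange, and the ncclUniqueId is handed from the lower rank to the higher one through the c10d key-value store. The sketch below is illustrative only and uses the public torch.distributed.TCPStore API rather than the internal C++ path; the host, port, and dummy payload are made up for the example. It fails the same way the run above fails: if the process hosting the store dies while a peer is blocked in get(), the peer sees "Connection reset by peer", which is exactly the store->get('0:1') error reported further down by ranks 50 and 51.

# Illustrative sketch only -- not nanotron's or PyTorch's internal code.
# Run once with RANK=0 and once with RANK=1; host/port below are arbitrary.
from datetime import timedelta
import os
import torch.distributed as dist

rank = int(os.environ.get("RANK", "0"))
store = dist.TCPStore(
    "127.0.0.1", 29500,            # assumed rendezvous endpoint for the sketch
    world_size=2,
    is_master=(rank == 0),         # rank 0 hosts the store, like "[0]" in the error below
    timeout=timedelta(seconds=30),
)
if rank == 0:
    # In the real code path this payload is the ncclUniqueId published by
    # ProcessGroupNCCL::broadcastUniqueNCCLID; here it is a stand-in value.
    store.set("0:1", b"dummy-unique-id")
else:
    # Blocks until the key appears. If the master-side process dies while we
    # wait, this raises with "Connection reset by peer" -- the failure mode
    # reported by ranks 50 and 51 below.
    print(store.get("0:1"))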
-[default0]:[rank48], [default6]:[rank54], [default0]:[rank56], [default2]:[rank58], [default3]:[rank59], [default7]:[rank63]: remaining frames continue through frame #63 as in the representative trace above; each rank closes its dump with ". This may indicate a possible application crash on rank 0 or a network set up issue."
-[default3]:[rank51]: Traceback (most recent call last):
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank51]:     trainer.train(dataloader)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank51]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank51]:     outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank51]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank51]:     output = model(**micro_batch)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank51]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank51]:     return forward_call(*args, **kwargs)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank51]:     sharded_logits = self.model(
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank51]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank51]:     return forward_call(*args, **kwargs)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank51]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank51]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank51]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank51]:     return forward_call(*args, **kwargs)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default3]:[rank51]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default3]:[rank51]:     pipeline_state.run_communication()
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default3]:[rank51]:     recv_activation_tensor = recv_activation()
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default3]:[rank51]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank51]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank51]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default3]:[rank51]:     dist.recv(
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank51]:     return func(*args, **kwargs)
-[default3]:[rank51]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank51]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank51]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank51]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default3]:[rank51]: frames #0-#19: same c10::Error -> TCPStore doWait/doGet/get -> PrefixStore::get -> broadcastUniqueNCCLID -> getNCCLComm -> ProcessGroupNCCL::recv -> libtorch internals path as the representative trace above
-[default2]:[rank50]: Traceback (most recent call last): identical call chain to rank 51 (run_train.py:237 -> trainer.py:429 -> trainer.py:462 -> engine.py:278 -> engine.py:44 -> llama.py:891 -> llama.py:764 -> llama.py:780 -> block.py:126 -> functional.py:117 -> state.py:150 -> state.py:31 -> p2p.py:353 -> p2p.py:326 -> p2p.py:246 -> c10d_logger.py:75 -> distributed_c10d.py:1932)
-[default2]:[rank50]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default2]:[rank50]: frames #0-#9: same frames as rank 51 (addresses differ)
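The traceback above bottoms out in dist.recv inside nanotron's pipeline-parallel _recv_meta: the receiving stage asks its upstream peer for the activation metadata, and the first recv from that peer lazily creates the NCCL communicator through the c10d store, which is where the error surfaces. The following is a minimal sketch of that call shape, assuming a torchrun launch with the nccl backend on a GPU node; recv_meta_sketch, the 8-element buffer, and the error wrapping are invented for illustration and are not nanotron's actual code.

# Minimal sketch (not nanotron's code): the call shape at the bottom of the
# traceback above. Launch with: torchrun --nproc_per_node=2 this_script.py
import os
import torch
import torch.distributed as dist

def recv_meta_sketch(from_rank: int, tag: int = 0) -> torch.Tensor:
    # Stand-in for p2p._recv_meta: receive a small metadata tensor before the
    # real activation. The first recv from a given peer builds the NCCL
    # communicator via the c10d store, which is where the run above died.
    buf = torch.empty(8, dtype=torch.long, device="cuda")
    try:
        dist.recv(buf, src=from_rank, tag=tag)
    except dist.DistBackendError as err:
        # Surface a clearer message when the sending rank has already crashed.
        raise RuntimeError(f"pipeline peer rank {from_rank} is unreachable: {err}") from err
    return buf

if __name__ == "__main__":
    # torchrun provides MASTER_ADDR/MASTER_PORT, RANK, WORLD_SIZE, LOCAL_RANK.
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    if dist.get_rank() == 0:
        dist.send(torch.arange(8, dtype=torch.long, device="cuda"), dst=1)
    elif dist.get_rank() == 1:
        print(recv_meta_sketch(from_rank=0))
    dist.destroy_process_group()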
-[default3]:[rank51]: frames #20-#63: remaining libtorch_python.so / CPython interpreter frames, as in the representative trace above
-[default3]:[rank51]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default2]:[rank50]: frames #10-#58: same frames as rank 51 (addresses differ), continuing:
-[default2]:[rank50]: 
frame #59: + 0x150582 (0x55bcb06a5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #60: PyObject_Call + 0xbc (0x55bcb06a5f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55bcb068c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #62: + 0x150582 (0x55bcb06a5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #63: PyObject_Call + 0xbc (0x55bcb06a5f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank60]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank60]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc4a058897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank60]: frame #1: + 0x5b3a23e (0x7efc83b7523e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7efc83b6fc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #3: c10d::TCPStore::doGet(std::string 
const&) + 0x32 (0x7efc83b6ff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7efc83b70fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7efc83b25371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7efc83b25371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7efc83b25371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7efc83b25371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7efc4b332189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7efc4b339610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7efc4b358978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #12: + 0x5adc309 (0x7efc83b17309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #13: + 0x5ae6f10 (0x7efc83b21f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #14: + 0x5ae6fa5 (0x7efc83b21fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #15: + 0x5124446 (0x7efc8315f446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #16: + 0x1acf4b8 (0x7efc7fb0a4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #17: + 0x5aee004 (0x7efc83b29004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #18: + 0x5af36b5 (0x7efc83b2e6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #19: + 0xd2631e (0x7efc9671831e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: frame #20: + 0x47def4 (0x7efc95e6fef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: frame #21: + 0x1445a6 (0x55876d20a5a6 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55876d203a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #23: + 0x150866 (0x55876d216866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55876d1ff142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55876d20aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #26: PyObject_Call + 0xbc (0x55876d216f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55876d1fd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55876d20aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55876d1fb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #30: + 0x150582 (0x55876d216582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55876d1fb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #32: + 0x150582 (0x55876d216582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55876d1fb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #34: + 0x150582 (0x55876d216582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55876d1fb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55876d202f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55876d214c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #38: + 0x211239 (0x55876d2d7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55876d203a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55876d1ff3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55876d20aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55876d1fac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55876d20aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55876d1fb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #45: + 0x150582 (0x55876d216582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #46: PyObject_Call + 0xbc (0x55876d216f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55876d1fd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #48: + 0x150582 (0x55876d216582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #49: PyObject_Call + 0xbc (0x55876d216f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55876d1fd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55876d20aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55876d203007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55876d214c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #54: + 0x211239 (0x55876d2d7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #55: PyObject_Call + 0x207 (0x55876d217067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55876d1fd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #57: + 0x150582 (0x55876d216582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55876d1fb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #59: + 0x150582 (0x55876d216582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #60: PyObject_Call + 0xbc (0x55876d216f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55876d1fd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #62: + 0x150582 (0x55876d216582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x55876d216f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
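Reading the stack traces above: the frame dumps from ranks 50 and 51 are the tail end of the same failure that rank 60 then shows in full. On the Python side it is nanotron's pipeline-parallel receive path (block.py -> recv_from_pipeline_state_buffer -> p2p._recv_meta -> dist.recv); underneath, ProcessGroupNCCL is setting up a point-to-point communicator, which means fetching the ncclUniqueId from the c10d TCPStore, and that lookup fails with "Connection reset by peer" because the process on the other end of the store connection has gone away — consistent with the "application crash on rank 0" hint in the message and with srun task 0 being the first to exit below. These ranks are therefore secondary casualties, and the useful question is which rank failed first. A hypothetical triage helper (not part of the original bench.slurm) that prints the first error line per rank from a combined log.out like this one, assuming the "[defaultN]:[rankM]:" prefixes produced by torchrun --tee 3:

    #!/bin/bash
    # first_error_per_rank.sh (hypothetical helper)
    # Print, for each rank, the first log line mentioning a traceback or error,
    # together with its line number in the file; the rank with the lowest line
    # number is usually the one to read first.
    LOG=${1:-log.out}
    grep -nE '\[rank[0-9]+\]:.*(Traceback|Error)' "$LOG" | awk -F'[][]' '!seen[$4]++'

Usage: bash first_error_per_rank.sh /path/to/log.out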
-W0703 09:43:22.751000 140380479002432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 567141 closing signal SIGTERM -E0703 09:43:23.193000 140380479002432 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 567140) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:43:22 - host : ip-26-0-169-139.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 567142) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:43:22 - host : ip-26-0-169-139.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 567143) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:43:22 - host : ip-26-0-169-139.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 567144) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:43:22 - host : ip-26-0-169-139.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 567145) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:43:22 - host : ip-26-0-169-139.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 567146) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:43:22 - host : ip-26-0-169-139.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 567147) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:43:22 - host : ip-26-0-169-139.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 567140) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-139: task 0: Exited with exit code 1 -W0703 09:43:26.817000 
140534699820800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-170-31.ec2.internal_3096567_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.057000 139786108692224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-56.ec2.internal_3427805_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.287000 139836089911040 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_36615_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.305000 139681971750656 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-239.ec2.internal_2556242_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.430000 140054893676288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3976586_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.479000 139998861629184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-207.ec2.internal_2584734_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.733000 139788562155264 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_964669_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
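The warnings above are the fallout pattern: once the agent on srun task 0 (ip-26-0-169-139, presumably the MASTER_ADDR node hosting the c10d store) has exited, every remaining torchrun agent fails its keep-alive heartbeat with RendezvousConnectionError, sends SIGTERM to its local workers, and then either reports a ChildFailedError table for its workers or crashes itself on the lost rendezvous, so the same incident is reported once per node in what follows. When reading those failure tables, a global rank maps back to a node and a GPU because this job runs 8 processes on each of 8 nodes. A hypothetical one-liner (not part of the original scripts) for the arithmetic:

    # rank_to_node.sh (hypothetical): map a global rank from a torchrun failure
    # table to its node index and local rank, assuming 8 GPUs per node and the
    # contiguous rank layout used by this job.
    rank=${1:-60}
    echo "global rank $rank -> node $((rank / 8)), local_rank $((rank % 8))"

For example, rank 60 gives node 7 / local_rank 4, which matches the ip-26-0-171-88 (srun task 7) entries reported below.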
-W0703 09:43:27.771000 140004522362688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2584817 closing signal SIGTERM -W0703 09:43:27.771000 140004522362688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2584818 closing signal SIGTERM -W0703 09:43:27.772000 140004522362688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2584819 closing signal SIGTERM -W0703 09:43:27.772000 140004522362688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2584820 closing signal SIGTERM -W0703 09:43:27.772000 140004522362688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2584821 closing signal SIGTERM -W0703 09:43:27.772000 140004522362688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2584822 closing signal SIGTERM -W0703 09:43:27.771000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427879 closing signal SIGTERM -W0703 09:43:27.771000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427880 closing signal SIGTERM -W0703 09:43:27.771000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427881 closing signal SIGTERM -W0703 09:43:27.771000 139687632484160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2556318 closing signal SIGTERM -W0703 09:43:27.771000 139687632484160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2556320 closing signal SIGTERM -W0703 09:43:27.771000 139687632484160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2556321 closing signal SIGTERM -W0703 09:43:27.772000 139687632484160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2556322 closing signal SIGTERM -W0703 09:43:27.772000 139687632484160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2556324 closing signal SIGTERM -W0703 09:43:27.773000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427882 closing signal SIGTERM -W0703 09:43:27.773000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427883 closing signal SIGTERM -W0703 09:43:27.774000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427884 closing signal SIGTERM -W0703 09:43:27.774000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427885 closing signal SIGTERM -W0703 09:43:27.774000 139791769425728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3427886 closing signal SIGTERM -W0703 09:43:27.773000 140540360554304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096641 closing signal SIGTERM -W0703 09:43:27.773000 140540360554304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096642 closing signal SIGTERM -W0703 09:43:27.773000 140540360554304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096643 closing signal SIGTERM -W0703 09:43:27.774000 140540360554304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096644 closing signal SIGTERM -W0703 09:43:27.774000 140540360554304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096645 closing signal SIGTERM -W0703 09:43:27.776000 140540360554304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096646 closing signal SIGTERM -W0703 09:43:27.776000 140540360554304 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096647 closing signal SIGTERM -W0703 09:43:27.777000 140540360554304 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3096648 closing signal SIGTERM -W0703 09:43:27.778000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976663 closing signal SIGTERM -W0703 09:43:27.779000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976664 closing signal SIGTERM -W0703 09:43:27.779000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976665 closing signal SIGTERM -W0703 09:43:27.780000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976666 closing signal SIGTERM -W0703 09:43:27.780000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976667 closing signal SIGTERM -W0703 09:43:27.781000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976668 closing signal SIGTERM -W0703 09:43:27.782000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976669 closing signal SIGTERM -W0703 09:43:27.784000 140060554409792 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3976670 closing signal SIGTERM -E0703 09:43:27.807000 139794222888768 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 964745) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:43:27.814000 139794222888768 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_964669_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.840000 139794222888768 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_964669_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.868000 139794222888768 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_964669_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 964746) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 964747) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 964748) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 964749) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 964750) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 964751) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 964752) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:43:27 - host : ip-26-0-171-88.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 964745) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:43:27.900000 139841750644544 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 36688) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:43:27.906000 139841750644544 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_36615_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.941000 139841750644544 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_36615_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:27.975000 139841750644544 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_36615_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 36689) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 36690) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 36691) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 36692) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 36693) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 36694) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 
2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 36695) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-247.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 36688) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-171-88: task 7: Exited with exit code 1 -srun: error: ip-26-0-169-247: task 3: Exited with exit code 1 -E0703 09:43:29.201000 139687632484160 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2556317) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:43:29.207000 139687632484160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2556242_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:29.235000 139687632484160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2556242_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:29.249000 139687632484160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2556242_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-239.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 2556319) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-239.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 2556323) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- 
-Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-239.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 2556317) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:43:29.296000 140004522362688 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2584816) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:43:29.302000 140004522362688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2584734_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:29.330000 140004522362688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2584734_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:29.345000 140004522362688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2584734_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-207.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 2584823) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:43:27 - host : ip-26-0-169-207.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 2584816) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-207: task 1: Exited with exit code 1 -srun: error: ip-26-0-169-239: task 2: Exited with exit code 1 -W0703 09:43:31.821000 140534699820800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-170-31.ec2.internal_3096567_0' has failed to send a 
keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:31.906000 140540360554304 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-170-31.ec2.internal_3096567_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:31.917000 140540360554304 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-170-31.ec2.internal_3096567_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The 
connection to the C10d store has failed. See inner exception for details. -W0703 09:43:32.061000 139786108692224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-56.ec2.internal_3427805_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-170-31: task 4: Exited with exit code 1 -W0703 09:43:32.435000 140054893676288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3976586_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:33.512000 139791769425728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_3427805_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:33.513000 140060554409792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3976586_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:43:33.522000 139791769425728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_3427805_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent -W0703 09:43:33.524000 140060554409792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3976586_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
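From here on the traceback lines are interleaved, because the last two surviving agents (on ip-26-0-171-56 and ip-26-0-171-62, srun tasks 5 and 6) hit the same failure at the same moment and both write to the single log.out that bench.slurm configures for the whole job via #SBATCH --output/--error. The underlying error is the same as above: _call_store gets DistNetworkError: Broken pipe from the c10d store, which surfaces as RendezvousConnectionError. If per-node readability matters more than having one combined file, a hedged alternative is to let srun split the output per task. This is a sketch, not part of the original script: RESULTS_DIR stands in for the results directory, and the grep-based status classification further down assumes a single log.out, so it would need the same adjustment.

    # Hypothetical variant of the launch line: one log file per node (srun task),
    # using srun's %N (node name) and %t (task id) filename patterns.
    srun -u --output="$RESULTS_DIR/log_%N_%t.out" --error="$RESULTS_DIR/log_%N_%t.out" $LAUNCHER $CMD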
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. 
- result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-171-56: task 5: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 6: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-128/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/bench.slurm deleted file mode 100644 index 4ce82d9cc33da23edcf2a595cfa3e41c9694a150..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. 
It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/config.yaml deleted file mode 100644 index ce2b7fcc0d9d2497c0208c43ec4743595c366a63..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 64 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 16 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out deleted file mode 100644 index 1f3aeff87d0e5c73163ff80c6dc74ccdc0511f36..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/log.out +++ /dev/null @@ -1,5943 +0,0 @@ -======================== -START TIME: Wed Jul 3 06:26:57 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 06:27:03.644000 139990378936128 torch/distributed/run.py:757] -W0703 06:27:03.644000 139990378936128 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.644000 139990378936128 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 06:27:03.644000 139990378936128 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.735000 139845769660224 torch/distributed/run.py:757] -W0703 06:27:03.735000 139845769660224 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.735000 139845769660224 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 06:27:03.735000 139845769660224 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.910000 139623832184640 torch/distributed/run.py:757] -W0703 06:27:03.910000 139623832184640 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.910000 139623832184640 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 06:27:03.910000 139623832184640 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.912000 140169910966080 torch/distributed/run.py:757] -W0703 06:27:03.912000 140169910966080 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.912000 140169910966080 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 06:27:03.912000 140169910966080 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.966000 140554660448064 torch/distributed/run.py:757] -W0703 06:27:03.966000 140554660448064 torch/distributed/run.py:757] ***************************************** -W0703 06:27:03.966000 140554660448064 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 06:27:03.966000 140554660448064 torch/distributed/run.py:757] ***************************************** -W0703 06:27:04.010000 140627470108480 torch/distributed/run.py:757] -W0703 06:27:04.010000 140627470108480 torch/distributed/run.py:757] ***************************************** -W0703 06:27:04.010000 140627470108480 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 06:27:04.010000 140627470108480 torch/distributed/run.py:757] ***************************************** -W0703 06:27:04.238000 139988192462656 torch/distributed/run.py:757] -W0703 06:27:04.238000 139988192462656 torch/distributed/run.py:757] ***************************************** -W0703 06:27:04.238000 139988192462656 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 06:27:04.238000 139988192462656 torch/distributed/run.py:757] ***************************************** -W0703 06:27:04.415000 140313211619136 torch/distributed/run.py:757] -W0703 06:27:04.415000 140313211619136 torch/distributed/run.py:757] ***************************************** -W0703 06:27:04.415000 140313211619136 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 06:27:04.415000 140313211619136 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 06:27:29 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=2, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 06:27:29 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=16, -[default0]:07/03/2024 06:27:29 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=64, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16')), -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 06:27:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: No checkpoint path provided. 
-[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. 
-[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: No checkpoint path provided. 
-[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: No checkpoint path provided. 
-[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: No checkpoint path provided. -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: No checkpoint path provided. -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 06:27:48 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: No checkpoint path provided. 
-[default0]:07/03/2024 06:27:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 06:27:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 06:27:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 06:27:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 06:27:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 06:27:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 06:27:51 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 06:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 06:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 06:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 06:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 06:27:53.749996 | mbs: 16 | grad_accum: 64 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 06:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 06:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default0]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=24|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=29|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=28|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=31|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=18|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=24|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=18|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=23|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=28|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=17|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=19|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=21|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=26|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=23|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=26|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=30|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=31|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=29|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=16|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=17|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=22|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=19|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=20|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=21|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 06:27:54 [WARNING|DP=0|PP=1|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=22|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=16|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 06:27:53 [WARNING|DP=0|PP=0|TP=20|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=25|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 06:27:53 [WARNING|DP=0|PP=1|TP=30|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 06:27:54 [WARNING|DP=0|PP=0|TP=25|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:54 [WARNING|DP=0|PP=0|TP=27|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 06:27:54 [WARNING|DP=0|PP=1|TP=27|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 06:27:54 [WARNING|DP=0|PP=1|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 06:27:54 [WARNING|DP=0|PP=1|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. 
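Every watchdog message above aborts the same pending operation (SeqNum=15, OpType=SEND) after the 600000 ms process-group timeout: global ranks 0-31 report it in their [Rank 0] group with NumelIn=65536, while global ranks 32-63 report it in their [Rank 1] group with NumelIn=4194304, so both pipeline stages are stuck on the same unmatched point-to-point exchange rather than on a slow collective. The timeout that triggers these aborts is the one passed to torch.distributed.init_process_group. As a hedged sketch only (standard PyTorch API, not the benchmark's own launch code), it can be raised when a long stall is expected, although that merely postpones the abort if the matching recv is never posted:

    import datetime
    import torch.distributed as dist

    # Sketch: raise the NCCL watchdog/collective timeout from the 600000 ms seen
    # in the messages above to 30 minutes. Under torchrun, init_method="env://"
    # picks up MASTER_ADDR/MASTER_PORT, RANK and WORLD_SIZE from the environment.
    dist.init_process_group(
        backend="nccl",
        init_method="env://",
        timeout=datetime.timedelta(minutes=30),
    )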
-[default3]:[rank11]: Traceback (most recent call last):
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank11]:     trainer.train(dataloader)
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank11]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank11]:     outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank11]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank11]:     grad_accumulator.backward(sum(activations))
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank11]:     result = loss.backward()
-[default3]:[rank11]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank11]:     torch.autograd.backward(
-[default3]:[rank11]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank11]:     _engine_run_backward(
-[default3]:[rank11]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank11]:     return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank11]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank11]:     return user_fn(self, *args)
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank11]:     pipeline_state.run_communication()
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default3]:[rank11]:     self.grads_buffer.append(recv_grad())
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default3]:[rank11]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank11]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank11]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank11]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank11]:     dist.recv(
-[default3]:[rank11]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank11]:     return func(*args, **kwargs)
-[default3]:[rank11]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank11]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank9]: Traceback (most recent call last):
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank9]:     trainer.train(dataloader)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank9]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank9]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default1]:[rank9]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default1]:[rank9]:     grad_accumulator.backward(sum(activations))
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default1]:[rank9]:     result = loss.backward()
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default1]:[rank9]:     torch.autograd.backward(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default1]:[rank9]:     _engine_run_backward(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default1]:[rank9]:     return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default1]:[rank9]:     return user_fn(self, *args)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default1]:[rank9]:     pipeline_state.run_communication()
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default1]:[rank9]:     self.grads_buffer.append(recv_grad())
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default1]:[rank9]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank9]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank9]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank9]:     dist.recv(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank9]:     return func(*args, **kwargs)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank9]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank22]: Traceback (most recent call last):
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank22]:     trainer.train(dataloader)
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank22]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank22]:     outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank22]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank22]:     grad_accumulator.backward(sum(activations))
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default6]:[rank22]:     result = loss.backward()
-[default6]:[rank22]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default6]:[rank22]:     torch.autograd.backward(
-[default6]:[rank22]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default6]:[rank22]:     _engine_run_backward(
-[default6]:[rank22]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default6]:[rank22]:     return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:[rank22]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default6]:[rank22]:     return user_fn(self, *args)
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default6]:[rank22]:     pipeline_state.run_communication()
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default6]:[rank22]:     self.grads_buffer.append(recv_grad())
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default6]:[rank22]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank22]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank22]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank22]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank22]:     dist.recv(
-[default6]:[rank22]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank22]:     return func(*args, **kwargs)
-[default6]:[rank22]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank22]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank39]: Traceback (most recent call last):
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank39]: trainer.train(dataloader)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank39]: output = model(**micro_batch)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank39]: return self._call_impl(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank39]: return forward_call(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank39]: sharded_logits = self.model(
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank39]: return self._call_impl(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank39]: return forward_call(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank39]: return self._call_impl(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank39]: return forward_call(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank39]: pipeline_state.run_communication()
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank39]: recv_activation_tensor = recv_activation()
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank39]: dist.recv(
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank39]: return func(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[… identical backward-pass tracebacks (grad_accumulator.backward → recv_grad → dist.recv) omitted for ranks 23, 21, 4, 24, 29, 16, 13, 15, 20 and 2; each ends in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."]
[… identical forward-pass traceback (recv_from_pipeline_state_buffer → recv_activation → dist.recv) omitted for rank 34; it ends in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank31]: self.grads_buffer.append(recv_grad()) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) 
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank0]: self.grads_buffer.append(recv_grad()) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank0]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank0]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank0]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default0]:[rank0]: dist.recv( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank0]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7edb3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b8008cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b80091a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b80092dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f1bcbb2be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f1bd0b72609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f1bd093d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7edb3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b8008cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b80091a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b80092dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f1bcbb2be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f1bd0b72609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f1bd093d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7edb3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f1b7fd16119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f1bcbb2be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f1bd0b72609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f1bd093d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, 
**kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7d3b95897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7d4e6ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7d4e73a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7d4e74dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff82090de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff825954609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff82571f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7d3b95897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7d4e6ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7d4e73a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7d4e74dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff82090de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff825954609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff82571f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7d3b95897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7ff7d4af8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7ff82090de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7ff825954609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7ff82571f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank52]: pipeline_state.run_communication() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank52]: dist.recv( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: return func(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank27]: result = loss.backward() -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank27]: _engine_run_backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank27]: return user_fn(self, *args) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank27]: self.grads_buffer.append(recv_grad()) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank27]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank27]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank27]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default3]:[rank27]: dist.recv( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank27]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank25]: self.grads_buffer.append(recv_grad()) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank25]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f28d8563897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f28d983cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank25]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank25]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f28d9841a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank25]: dist.recv( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank25]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f28d9842dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f29252dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f292a322609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f292a0ed353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f28d8563897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f28d983cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f28d9841a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f28d9842dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f29252dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f292a322609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f292a0ed353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f28d8563897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f28d94c6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f29252dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f292a322609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f292a0ed353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step 
-[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7de9a05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7deacdec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7deace3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7deace4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f7e3677de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f7e3b7c4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f7e3b58f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7de9a05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7deacdec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7deace3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7deace4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f7e3677de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f7e3b7c4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f7e3b58f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7de9a05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f7dea968119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f7e3677de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f7e3b7c4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f7e3b58f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = 
recv_from_pipeline_state_buffer(
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default2]:[rank58]: pipeline_state.run_communication()
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default2]:[rank58]: recv_activation_tensor = recv_activation()
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default2]:[rank58]: dist.recv(
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank58]: return func(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4bad1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa4bbff3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa4bbff8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa4bbff9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa507a92e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa50cad9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa50c8a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4bad1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa4bbff3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa4bbff8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa4bbff9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa507a92e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa50cad9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa50c8a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4bad1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fa4bbc7d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fa507a92e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fa50cad9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fa50c8a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcae5d3a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcae7013c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcae7018a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcae7019dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fcb32ab2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fcb37af9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fcb378c4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcae5d3a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcae7013c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcae7018a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcae7019dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fcb32ab2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fcb37af9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fcb378c4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcae5d3a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fcae6c9d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fcb32ab2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fcb37af9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fcb378c4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2cd5fae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2cd7287c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2cd728ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2cd728ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2d22d26e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f2d27d6d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2d27b38353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2cd5fae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2cd7287c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2cd728ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2cd728ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2d22d26e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f2d27d6d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2d27b38353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2cd5fae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f2cd6f11119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f2d22d26e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f2d27d6d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f2d27b38353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = 
recv_from_pipeline_state_buffer(
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank33]: pipeline_state.run_communication()
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank33]: recv_activation_tensor = recv_activation()
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank33]: dist.recv(
-[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank33]: return func(*args, **kwargs)
-[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer
-[default6]:[rank46]: pipeline_state.run_communication()
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank46]: recv_activation_tensor = recv_activation()
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank46]: dist.recv(
-[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank46]: return func(*args, **kwargs)
-[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer
-[default7]:[rank47]: pipeline_state.run_communication()
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank47]: recv_activation_tensor = recv_activation()
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank47]: dist.recv(
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank47]: return func(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer
-[default4]:[rank44]: pipeline_state.run_communication()
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank44]: recv_activation_tensor = recv_activation()
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank44]: dist.recv(
-[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank44]: return func(*args, **kwargs)
-[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe68d534897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe68e80dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe68e812a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe68e813dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe6da2ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe6df2f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe6df0be353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8edb921897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8edcbfac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8edcbffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8edcc00dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8f28699e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f8f2d6e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f8f2d4ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8edb921897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8edcbfac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8edcbffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8edcc00dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f8f28699e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f8f2d6e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f8f2d4ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8edb921897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f8edc884119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f8f28699e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f8f2d6e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f8f2d4ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc838f9b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc83a274c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc83a279a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc83a27adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc885d13e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc88ad5a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc88ab25353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc838f9b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc83a274c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc83a279a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc83a27adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc885d13e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc88ad5a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc88ab25353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc838f9b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fc839efe119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fc885d13e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fc88ad5a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fc88ab25353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f650249e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6503777c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f650377ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f650377ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f654f216e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f655425d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f6554028353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f650249e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6503777c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f650377ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f650377ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f654f216e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f655425d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f6554028353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f650249e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f6503401119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f654f216e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f655425d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f6554028353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f6e6e4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9f6f9bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9f6f9c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9f6f9c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f9fbb45ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f9fc04a3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f9fc026e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f6e6e4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9f6f9bdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9f6f9c2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9f6f9c3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f9fbb45ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f9fc04a3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f9fc026e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f6e6e4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f9f6f647119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f9fbb45ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f9fc04a3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f9fc026e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank19]: self.grads_buffer.append(recv_grad()) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank19]: dist.recv( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank18]: result = loss.backward() -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: return user_fn(self, *args) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank18]: self.grads_buffer.append(recv_grad()) -[default2]:[rank18]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank18]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank18]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank18]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank18]: dist.recv( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank18]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8379e1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd838cbac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34d831a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f34d95f3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f34d95f8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa890ddd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f34d95f9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd838cbfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3525092e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8920b6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8920bba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd838cc0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8920bcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #5: + 0x8609 (0x7f352a0d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3529ea4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #4: + 0xd3e95 (0x7fd884759e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #4: + 0xd3e95 (0x7fa8ddb55e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd8897a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #5: + 0x8609 (0x7fa8e2b9c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fd88956b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]:frame #6: clone + 0x43 (0x7fa8e2967353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34d831a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default0]: -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8379e1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd838cbac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f34d95f3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd838cbfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd838cc0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f34d95f8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f34d95f9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fd884759e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #4: + 0xd3e95 (0x7f3525092e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd8897a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #5: + 0x8609 (0x7f352a0d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3529ea4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa890ddd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #6: clone + 0x43 (0x7fd88956b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8920b6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8920bba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8920bcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8379e1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #4: + 0xd3e95 (0x7fa8ddb55e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa8e2b9c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #1: + 0xe32119 (0x7fd838944119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #6: clone + 0x43 (0x7fa8e2967353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #2: + 0xd3e95 (0x7fd884759e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]: -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #3: + 0x8609 (0x7fd8897a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fd88956b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34d831a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa890ddd897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]: -[default7]:frame #1: + 0xe32119 (0x7f34d927d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f3525092e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #1: + 0xe32119 (0x7fa891d40119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fa8ddb55e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fa8e2b9c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #3: + 0x8609 (0x7f352a0d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f3529ea4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #4: clone + 0x43 (0x7fa8e2967353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe68d534897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe68e80dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe68e812a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe68e813dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe6da2ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe6df2f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe6df0be353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe68d534897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fe68e497119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fe6da2ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fe6df2f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fe6df0be353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank8]: 
Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: self.grads_buffer.append(recv_grad()) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank8]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank8]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank8]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank8]: dist.recv( 
-[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank8]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78dec12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f78dfeebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f78dfef0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f78dfef1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f792b98ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f79309d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f793079c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78dec12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f78dfeebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f78dfef0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f78dfef1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f792b98ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f79309d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f793079c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78dec12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f78dfb75119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f792b98ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f79309d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f793079c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank14]: self.grads_buffer.append(recv_grad()) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank14]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank14]: buffers, futures = 
self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank14]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank14]: dist.recv( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank14]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb11fb34897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb120e0dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb120e12a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb120e13dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fb16c8ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fb1718f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fb1716be353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb11fb34897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb120e0dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb120e12a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb120e13dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fb16c8ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fb1718f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fb1716be353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb11fb34897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fb120a97119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fb16c8ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fb1718f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fb1716be353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c1c73a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c1da13c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c1da18a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c1da19dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7c694b2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7c6e4f9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7c6e2c4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c1c73a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c1da13c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c1da18a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c1da19dcc in /fsx/ferdinandmom/miniforge3/envs/env-be[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84b7a6e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84b8d47c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84b8d4ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84b8d4ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f85047e6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f850982d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f85095f8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84b7a6e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84b8d47c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84b8d4ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84b8d4ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f85047e6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f850982d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f85095f8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84b7a6e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f84b89d1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f85047e6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f850982d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f85095f8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47ae9dc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47afcb5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47afcbaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47afcbbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f47fb754e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f480079b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4800566353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47ae9dc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47afcb5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47afcbaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47afcbbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f47fb754e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f480079b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4800566353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47ae9dc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f47af93f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f47fb754e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f480079b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f4800566353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-nch-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7c694b2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7c6e4f9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7c6e2c4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c1c73a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f7c1d69d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f7c694b2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f7c6e4f9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f7c6e2c4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", 
line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank6]: self.grads_buffer.append(recv_grad()) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank6]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank6]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank6]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank6]: dist.recv( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank6]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d08976897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d09c4fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d09c54a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d09c55dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9d556eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9d5a735609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9d5a500353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d08976897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d09c4fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d09c54a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d09c55dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9d556eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9d5a735609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9d5a500353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d08976897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f9d098d9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f9d556eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f9d5a735609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f9d5a500353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank5]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank5]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank5]: dist.recv( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank3]: self.grads_buffer.append(recv_grad()) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", 
line 50, in __call__ -[default3]:[rank3]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank3]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank3]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank3]: dist.recv( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank3]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return 
user_fn(self, *args) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank10]: self.grads_buffer.append(recv_grad()) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank10]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank10]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank10]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank10]: dist.recv( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank10]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff8002c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff80159ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff80159fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff8015a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7ff84d039e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7ff852080609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7ff851e4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff8002c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff80159ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff80159fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff8015a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7ff84d039e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7ff852080609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7ff851e4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff8002c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7ff801224119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7ff84d039e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7ff852080609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7ff851e4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c1d3f4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4c1e6cdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4c1e6d2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4c1e6d3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4c6a16ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4c6f1b3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4c6ef7e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c1d3f4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4c1e6cdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4c1e6d2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4c1e6d3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4c6a16ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4c6f1b3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4c6ef7e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4c1d3f4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4c1e357119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f4c6a16ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f4c6f1b3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4c6ef7e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank12]: self.grads_buffer.append(recv_grad()) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank12]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank12]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank12]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank12]: dist.recv( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank12]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f804b358897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f804c631c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f804c636a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f804c637dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f80980d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f809d117609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f809cee2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f804b358897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f804c631c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f804c636a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f804c637dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f80980d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f809d117609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f809cee2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f804b358897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f804c2bb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f80980d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f809d117609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f809cee2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f031c964897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f031dc3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f031dc42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f031dc43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f03696dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f036e723609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f036e4ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f031c964897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f031dc3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f031dc42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f031dc43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f03696dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f036e723609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f036e4ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f031c964897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f031d8c7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f03696dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f036e723609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f036e4ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc6358b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc636b90c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc636b95a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc636b96dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc68262fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc687676609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc687441353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc6358b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc636b90c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc636b95a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc636b96dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc68262fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc687676609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc687441353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc6358b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fc63681a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fc68262fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fc687676609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fc687441353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f82c719c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f82c8475c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f82c847aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f82c847bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f8313f14e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f8318f5b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f8318d26353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f82c719c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f82c8475c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f82c847aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f82c847bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f8313f14e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f8318f5b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f8318d26353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f82c719c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:frame #1: + 0xe32119 (0x7f82c80ff119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f8313f14e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f8318f5b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:frame #4: clone + 0x43 (0x7f8318d26353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]: -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank63]: Traceback (most recent call last):
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default7]:[rank63]: trainer.train(dataloader)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank63]: output = model(**micro_batch)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank63]: sharded_logits = self.model(
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank63]: pipeline_state.run_communication()
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank63]: recv_activation_tensor = recv_activation()
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank63]: dist.recv(
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank63]: return func(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600011 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f675bbe1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f675cebac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f675cebfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f675cec0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:frame #4: + 0xd3e95 (0x7f67a8959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:frame #5: + 0x8609 (0x7f67ad9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f67ad76b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default3]:[rank59]: Traceback (most recent call last):
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1d3b27897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default3]:[rank59]: trainer.train(dataloader)
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1d4e00c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1d4e05a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1d4e06dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600011 milliseconds before timing out.
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:frame #4: + 0xd3e95 (0x7fb22089fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f675bbe1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #5: + 0x8609 (0x7fb2258e6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f675cebac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f675cebfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #6: clone + 0x43 (0x7fb2256b1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f675cec0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:frame #4: + 0xd3e95 (0x7f67a8959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:[rank59]: output = model(**micro_batch)
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default0]:frame #5: + 0x8609 (0x7f67ad9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #6: clone + 0x43 (0x7f67ad76b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1d3b27897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1d4e00c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f675bbe1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank59]: return self._call_impl(*args, **kwargs)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1d4e05a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #1: + 0xe32119 (0x7f675cb44119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:frame #2: + 0xd3e95 (0x7f67a8959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1d4e06dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank59]: return forward_call(*args, **kwargs)
-[default0]:frame #3: + 0x8609 (0x7f67ad9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: + 0xd3e95 (0x7fb22089fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #4: clone + 0x43 (0x7f67ad76b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:frame #5: + 0x8609 (0x7fb2258e6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:
-[default2]:frame #6: clone + 0x43 (0x7fb2256b1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:
-[default3]:[rank59]: sharded_logits = self.model(
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1d3b27897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:[rank59]: return self._call_impl(*args, **kwargs)
-[default2]:frame #1: + 0xe32119 (0x7fb1d4a8a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank59]: return forward_call(*args, **kwargs)
-[default2]:frame #2: + 0xd3e95 (0x7fb22089fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:frame #3: + 0x8609 (0x7fb2258e6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: clone + 0x43 (0x7fb2256b1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank59]: return self._call_impl(*args, **kwargs)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank59]: return forward_call(*args, **kwargs)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default3]:[rank59]: pipeline_state.run_communication()
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default3]:[rank59]: recv_activation_tensor = recv_activation()
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank59]: dist.recv(
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank59]: return func(*args, **kwargs)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600057 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fefec7d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fefedaaec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fefedab3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fefedab4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7ff03954de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7ff03e594609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7ff03e35f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600057 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fefec7d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fefedaaec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fefedab3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fefedab4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7ff03954de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7ff03e594609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7ff03e35f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fefec7d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: + 0xe32119 (0x7fefed738119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: + 0xd3e95 (0x7ff03954de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #3: + 0x8609 (0x7ff03e594609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7ff03e35f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600056 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f53389c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5339ca2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5339ca7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5339ca8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f5385741e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f538a788609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f538a553353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600056 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f53389c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5339ca2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5339ca7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5339ca8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f5385741e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f538a788609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f538a553353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f53389c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7f533992c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: + 0xd3e95 (0x7f5385741e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #3: + 0x8609 (0x7f538a788609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #4: clone + 0x43 (0x7f538a553353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600086 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1514629897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1515902c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1515907a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1515908dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f15613a1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f15663e8609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f15661b3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600086 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1514629897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1515902c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1515907a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1515908dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f15613a1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f15663e8609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f15661b3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1514629897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: + 0xe32119 (0x7f151558c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: + 0xd3e95 (0x7f15613a1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: + 0x8609 (0x7f15663e8609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7f15661b3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f0bd91897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f0d06ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f0d06fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f0d070dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f2f58b09e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f2f5db50609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f2f5d91b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f0bd91897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f0d06ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f0d06fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f0d070dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f2f58b09e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f2f5db50609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f2f5d91b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f0bd91897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: + 0xe32119 (0x7f2f0ccf4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: + 0xd3e95 (0x7f2f58b09e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: + 0x8609 (0x7f2f5db50609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7f2f5d91b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default6]:[rank30]: Traceback (most recent call last):
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default6]:[rank30]: trainer.train(dataloader)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank30]: grad_accumulator.backward(sum(activations))
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default6]:[rank30]: result = loss.backward()
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default6]:[rank30]: torch.autograd.backward(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default6]:[rank30]: _engine_run_backward(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default6]:[rank30]: return user_fn(self, *args)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default6]:[rank30]: pipeline_state.run_communication()
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default6]:[rank30]: self.grads_buffer.append(recv_grad())
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default6]:[rank30]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank30]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank30]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank30]: dist.recv(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank30]: return func(*args, **kwargs)
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbefdb3b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbefee14c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbefee19a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbefee1adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7fbf4a8b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7fbf4f8fa609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7fbf4f6c5353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbefdb3b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbefee14c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbefee19a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbefee1adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7fbf4a8b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7fbf4f8fa609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7fbf4f6c5353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbefdb3b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: + 0xe32119 (0x7fbefea9e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: + 0xd3e95 (0x7fbf4a8b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: + 0x8609 (0x7fbf4f8fa609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7fbf4f6c5353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f37167ae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3717a87c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3717a8ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3717a8ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f3763526e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f376856d609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f3768338353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f37167ae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3717a87c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3717a8ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3717a8ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f3763526e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f376856d609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f3768338353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f37167ae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7f3717711119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: + 0xd3e95 (0x7f3763526e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #3: + 0x8609 (0x7f376856d609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #4: clone + 0x43 (0x7f3768338353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600070 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f38d5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f3a035c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f3a03aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f3a03bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7f2f85ad4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7f2f8ab1b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7f2f8a8e6353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600070 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f38d5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f3a035c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f3a03aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f3a03bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7f2f85ad4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7f2f8ab1b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7f2f8a8e6353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f38d5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: + 0xe32119 (0x7f2f39cbf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: + 0xd3e95 (0x7f2f85ad4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #3: + 0x8609 (0x7f2f8ab1b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7f2f8a8e6353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa2d2486897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa2d375fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa2d3764a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa2d3765dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7fa31f1fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7fa324245609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7fa324010353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa2d2486897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa2d375fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa2d3764a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa2d3765dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7fa31f1fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7fa324245609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7fa324010353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa2d2486897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: + 0xe32119 (0x7fa2d33e9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: + 0xd3e95 (0x7fa31f1fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: + 0x8609 (0x7fa324245609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7fa324010353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f399a93d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f399bc16c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f399bc1ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f399bc1cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: + 0xd3e95 (0x7f39e76b5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: + 0x8609 (0x7f39ec6fc609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7f39ec4c7353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f399a93d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f399bc16c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f399bc1ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f399bc1cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f39e76b5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f39ec6fc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f39ec4c7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f399a93d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f399b8a0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f39e76b5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f39ec6fc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f39ec4c7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f491aac9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f491bda2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f491bda7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f491bda8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4967841e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f496c888609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f496c653353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f491aac9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f491bda2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f491bda7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f491bda8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4967841e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f496c888609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f496c653353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f491aac9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. 
Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe5ce983897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f491ba2c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f4967841e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f496c888609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f496c653353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe5cfc5cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe5cfc61a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe5cfc62dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe61b6fbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe620742609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe62050d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe5ce983897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe5cfc5cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe5cfc61a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe5cfc62dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe61b6fbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe620742609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe62050d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe5ce983897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fe5cf8e6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fe61b6fbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fe620742609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fe62050d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f30987db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3099ab4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3099ab9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3099abadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f30e5553e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f30ea59a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f30ea365353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f30987db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3099ab4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3099ab9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3099abadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f30e5553e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f30ea59a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f30ea365353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f30987db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f309973e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f30e5553e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f30ea59a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f30ea365353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1da4b3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1da5e17c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1da5e1ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1da5e1ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1df18b6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1df68fd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1df66c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1da4b3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1da5e17c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1da5e1ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1da5e1ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1df18b6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1df68fd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1df66c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1da4b3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f1da5aa1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f1df18b6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f1df68fd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f1df66c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc7e8f6b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc7ea244c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc7ea249a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc7ea24adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc835ce3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc83ad2a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc83aaf5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc7e8f6b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc7ea244c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc7ea249a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc7ea24adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc835ce3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc83ad2a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc83aaf5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc7e8f6b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fc7e9ece119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fc835ce3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fc83ad2a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fc83aaf5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd02aae3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd02bdbcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd02bdc1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd02bdc2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd07785be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd07c8a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd07c66d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd02aae3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd02bdbcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd02bdc1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd02bdc2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd07785be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd07c8a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd07c66d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd02aae3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd02ba46119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd07785be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fd07c8a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd07c66d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1be532f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1be6608c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1be660da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1be660edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1c320a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1c370ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1c36eb9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1be532f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1be6608c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1be660da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1be660edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1c320a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1c370ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1c36eb9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1be532f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f1be6292119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f1c320a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f1c370ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f1c36eb9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f312fa62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3130d3bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3130d40a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3130d41dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f317c7dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3181821609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f31815ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f312fa62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3130d3bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3130d40a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3130d41dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f317c7dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3181821609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f31815ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f312fa62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f31309c5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f317c7dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f3181821609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f31815ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7704e09897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f77060e2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f77060e7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f77060e8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7751b81e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7756bc8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7756993353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7704e09897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f77060e2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f77060e7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f77060e8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7751b81e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7756bc8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7756993353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7704e09897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f7705d6c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f7751b81e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f7756bc8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f7756993353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9b6b194897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9b6c46dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9b6c472a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9b6c473dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f9bb7f0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f9bbcf53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f9bbcd1e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9b6b194897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9b6c46dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9b6c472a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9b6c473dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f9bb7f0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f9bbcf53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f9bbcd1e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9b6b194897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f9b6c0f7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f9bb7f0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f9bbcf53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f9bbcd1e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb293901897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb294bdac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb294bdfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb294be0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb2e0679e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb2e56c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb2e548b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb293901897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb294bdac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb294bdfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb294be0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb2e0679e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb2e56c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb2e548b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb293901897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fb294864119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb2e0679e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: <unknown function> + 0x8609 (0x7fb2e56c0609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7fb2e548b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff957cd0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7ff958fa9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff958faea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff958fafdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: <unknown function> + 0xd3e95 (0x7ff9a4a48e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: <unknown function> + 0x8609 (0x7ff9a9a8f609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7ff9a985a353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff957cd0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7ff958fa9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff958faea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff958fafdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: <unknown function> + 0xd3e95 (0x7ff9a4a48e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: <unknown function> + 0x8609 (0x7ff9a9a8f609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7ff9a985a353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff957cd0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: <unknown function> + 0xe32119 (0x7ff958c33119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: <unknown function> + 0xd3e95 (0x7ff9a4a48e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: <unknown function> + 0x8609 (0x7ff9a9a8f609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7ff9a985a353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
[… identical ProcessGroupNCCL watchdog timeout traces, differing only in library load addresses, are repeated for ranks 61, 63, 59, 7, 5, 3, 10, 8, 12, 14, 49, 53, 55, and 30: each reports WorkNCCL(SeqNum=15, OpType=SEND, Timeout(ms)=600000) running for just over 600000 milliseconds before timing out, with NumelIn=NumelOut=65536 on the [PG 4 Rank 0] ranks and NumelIn=NumelOut=4194304 on the [PG 4 Rank 1] ranks …]
-W0703 06:39:11.166000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3250431 closing signal SIGTERM
-W0703 06:39:11.166000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3250432 closing signal SIGTERM
-W0703 06:39:11.166000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3250433 closing signal SIGTERM
-W0703 06:39:11.167000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3250434 closing signal SIGTERM
-W0703 06:39:11.167000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3250435 closing signal SIGTERM
-W0703 06:39:11.167000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3250437 closing signal SIGTERM
-W0703 06:39:11.167000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3250438 closing signal SIGTERM
-W0703 06:39:11.207000 139988192462656 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1893305 closing signal SIGTERM
-W0703 06:39:11.207000 139988192462656 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1893307 closing signal SIGTERM
-W0703 06:39:11.207000 139988192462656 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1893309 closing signal SIGTERM
-W0703 06:39:11.207000 139988192462656 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1893310 closing signal SIGTERM -E0703 06:39:11.328000 139845769660224 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 935247) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 06:39:11.335000 139990378936128 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1170359) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1170360) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1170360 -[2]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1170361) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1170361 -[3]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1170362) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1170362 -[4]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1170363) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1170363 -[5]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1170364) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1170364 -[6]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1170365) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1170365 -[7]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1170366) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1170366 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:11 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1170359) - error_file: - traceback : Signal 6 (SIGABRT) received by 
PID 1170359 -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 935248) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935248 -[2]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 935249) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935249 -[3]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 935250) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935250 -[4]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 935251) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935251 -[5]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 935252) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935252 -[6]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 935253) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935253 -[7]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 935254) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935254 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:11 - host : ip-26-0-172-73.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 935247) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 935247 -============================================================ -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -E0703 06:39:12.872000 139988192462656 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1893304) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:39:12.888000 139988192462656 
torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1893231_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:12.919000 139988192462656 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1893231_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:12.937000 139988192462656 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1893231_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:39:11 - host : ip-26-0-168-238.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 1893306) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1893306 -[2]: - time : 2024-07-03_06:39:11 - host : ip-26-0-168-238.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 1893308) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1893308 -[3]: - time : 2024-07-03_06:39:11 - host : ip-26-0-168-238.ec2.internal - rank : 39 (local_rank: 7) - exitcode : -6 (pid: 1893311) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1893311 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:11 - host : ip-26-0-168-238.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 1893304) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1893304 -============================================================ -E0703 06:39:13.395000 139623832184640 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 5 (pid: 3250436) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:39:13.409000 139623832184640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3250358_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -W0703 06:39:13.439000 139623832184640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3250358_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:13.447000 139623832184640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3250358_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:11 - host : ip-26-0-163-226.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 3250436) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3250436 -============================================================ -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -W0703 06:39:15.493000 140164250232576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-178.ec2.internal_556621_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:15.581000 140621809374976 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1866198_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:15.678000 140548999714560 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1093336_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:15.899000 140307550885632 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_804629_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-E0703 06:39:16.263000 140554660448064 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1093408) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:39:16.276000 140554660448064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1093336_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 06:39:16.303000 140169910966080 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 556694) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:39:16.312000 140554660448064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1093336_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:16.316000 140169910966080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_556621_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 06:39:16.328000 140313211619136 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 804703) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 06:39:16.340000 140554660448064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1093336_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -E0703 06:39:16.336000 140627470108480 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1866271) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 1093409) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1093409 -[2]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : -6 (pid: 1093410) - error_file: - traceback : 
Signal 6 (SIGABRT) received by PID 1093410 -[3]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : -6 (pid: 1093411) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1093411 -[4]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 1093412) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1093412 -[5]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : -6 (pid: 1093413) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1093413 -[6]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 1093414) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1093414 -[7]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : -6 (pid: 1093415) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1093415 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:16 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 1093408) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1093408 -============================================================ -W0703 06:39:16.341000 140313211619136 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_804629_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:16.349000 140627470108480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1866198_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:16.356000 140169910966080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_556621_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:16.371000 140313211619136 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_804629_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:16.378000 140627470108480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1866198_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 06:39:16.385000 140169910966080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_556621_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 556695) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556695 -[2]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 556696) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556696 -[3]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 556697) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556697 -[4]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 556698) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556698 -[5]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 556699) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556699 -[6]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 556700) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556700 -[7]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 556701) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556701 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:16 - host : ip-26-0-161-178.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 556694) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 556694 -============================================================ -W0703 06:39:16.403000 140313211619136 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_804629_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -W0703 06:39:16.407000 140627470108480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1866198_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 804704) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804704 -[2]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 804705) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804705 -[3]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 804706) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804706 -[4]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 804707) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804707 -[5]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 804708) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804708 -[6]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 804709) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804709 -[7]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 804710) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804710 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:16 - host : ip-26-0-163-220.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 804703) - error_file: - traceback : Signal 6 (SIGABRT) 
received by PID 804703 -============================================================ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 1866272) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866272 -[2]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 1866273) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866273 -[3]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 1866274) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866274 -[4]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 1866275) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866275 -[5]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 1866276) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866276 -[6]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 1866277) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866277 -[7]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 1866278) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866278 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_06:39:16 - host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 1866271) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1866271 -============================================================ -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
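The failure recorded in the log above is the NCCL watchdog aborting every rank after a pipeline-parallel SEND exceeded the 600000 ms (10-minute) collective timeout; the aborts then surface as SIGABRT (exit code -6) in the torchrun ChildFailedError reports and as the subsequent rendezvous-shutdown warnings. A point-to-point SEND hanging like this usually means the matching receive was never posted, for example because a peer rank died earlier or the pipeline schedule stalled. As a minimal sketch only (not part of the deleted bench_cluster scripts, and purely diagnostic: it buys time to inspect hung ranks rather than fixing the underlying stall), the watchdog window could be widened where the process group is created; the 30-minute value is an arbitrary example.

```python
# Minimal sketch, assuming the usual torchrun-provided environment
# (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT), as set up by the
# LAUNCHER line in the deleted bench.slurm scripts.
from datetime import timedelta

import torch.distributed as dist

dist.init_process_group(
    backend="nccl",
    # Per-collective timeout enforced by the ProcessGroupNCCL watchdog;
    # the run above hit the 600000 ms (10-minute) limit on a SEND.
    timeout=timedelta(minutes=30),
)
```

Note that this only moves the point at which the watchdog fires: once it does, the status-classification block in bench.slurm still greps the log for "Timeout at NCCL" and records the run as "timeout", exactly as happened for the configuration above.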
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-16/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/bench.slurm deleted file mode 100644 index 6a8f80a56443209af3c123676aa458873e8ab14a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/config.yaml deleted file mode 100644 index b1b6834608d1fa22bae0a3caa8e86d9d51e6af0b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 512 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 2 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out deleted file mode 100644 index 558e57848b1af2f25e04bad766955483a2b70703..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/log.out +++ /dev/null @@ -1,5814 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:51:45 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:51:48.468000 139787007002432 torch/distributed/run.py:757] -W0703 09:51:48.468000 139787007002432 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.468000 139787007002432 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:51:48.468000 139787007002432 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.469000 140344336561984 torch/distributed/run.py:757] -W0703 09:51:48.469000 140344336561984 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.469000 140344336561984 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:51:48.469000 140344336561984 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.469000 139894485751616 torch/distributed/run.py:757] -W0703 09:51:48.469000 139894485751616 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.469000 139894485751616 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:51:48.469000 139894485751616 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.471000 140673293223744 torch/distributed/run.py:757] -W0703 09:51:48.471000 140673293223744 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.471000 140673293223744 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:51:48.471000 140673293223744 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.472000 140108365117248 torch/distributed/run.py:757] -W0703 09:51:48.472000 140108365117248 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.472000 140108365117248 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:51:48.472000 140108365117248 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.477000 140601061177152 torch/distributed/run.py:757] -W0703 09:51:48.477000 140601061177152 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.477000 140601061177152 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:51:48.477000 140601061177152 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.483000 139815232800576 torch/distributed/run.py:757] -W0703 09:51:48.483000 139815232800576 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.483000 139815232800576 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:51:48.483000 139815232800576 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.584000 140063912851264 torch/distributed/run.py:757] -W0703 09:51:48.584000 140063912851264 torch/distributed/run.py:757] ***************************************** -W0703 09:51:48.584000 140063912851264 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:51:48.584000 140063912851264 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:52:08 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config: -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: run='%date_%jobid', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: step=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: consumed_train_samples=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: benchmark_csv_path=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp=2, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp=32, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp_engine=, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_mode=, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: expert_parallel_size=1), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:52:08 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: eos_token_id=2, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_act='silu', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_size=2048, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: initializer_range=0.02, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: intermediate_size=4096, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: is_llama_config=True, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: max_position_embeddings=4096, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_attention_heads=32, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_hidden_layers=24, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_key_value_heads=32, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pad_token_id=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pretraining_tp=1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_scaling=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_theta=10000.0, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tie_word_embeddings=True, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: use_cache=True, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: vocab_size=50272), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_revision=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_max_length=None), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoint_interval=100000, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: save_initial_state=False, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: log_level_replica='info', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: train_steps=20, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: micro_batch_size=2, -[default0]:07/03/2024 09:52:08 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: batch_accumulation_per_replica=512, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: val_check_interval=-1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_val_batches=0, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_test_batches=0), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta1=0.9, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta2=0.95, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: name='adamW'), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: zero_stage=1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: weight_decay=0.01, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: clip_grad=1.0, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_steps=1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_style='linear', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_style='linear', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_steps=19, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: start_training_step=1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_splits='train', -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: text_column_name='text'), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_loading_workers=0))], -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2')), -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lighteval=None) -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Model Config: -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: 
-[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Building model..
-[default0]:07/03/2024 09:52:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Setting PP block ranks...
-[default0]:07/03/2024 09:52:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Total number of parameters: 1.22G (2318.88MiB)
-[default0]:07/03/2024 09:52:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Local number of parameters: 21.6M (41.25MiB)
-[default0]:07/03/2024 09:52:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB
-[default0]:07/03/2024 09:52:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: No checkpoint path provided.
-[default0]:07/03/2024 09:52:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Parametrizing model parameters using StandardParametrizator
-[... every other PP=0 rank (TP=0-31 on ip-26-0-169-139, ip-26-0-169-207, ip-26-0-169-239, ip-26-0-169-247) logs the same three lines at 09:52:26: "Local number of parameters: 21.6M (41.25MiB)", "[After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB", "No checkpoint path provided." ...]
-[... every PP=1 rank (TP=0-31 on ip-26-0-170-31, ip-26-0-171-56, ip-26-0-171-62, ip-26-0-171-88) logs the corresponding lines at 09:52:26: "Local number of parameters: 16.4M (31.22MiB)", "[After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB", "No checkpoint path provided." ...]
-[default0]:07/03/2024 09:52:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 09:52:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 09:52:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states
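The per-rank numbers above are consistent with the reported total: 32 tensor-parallel ranks in pipeline stage 0 at 21.6M local parameters each, plus 32 ranks in stage 1 at 16.4M each, comes to roughly 1.22G. The cross-check below is an editorial sketch, not part of the log; the 2-byte element size assumes the bf16 dtype from the config, and tp=32 comes from the run name.

# Cross-check of the "Local number of parameters" lines against the logged total.
tp = 32                                # tensor-parallel degree per pipeline stage
pp0_local, pp1_local = 21.6e6, 16.4e6  # local params per TP rank, copied from the log
total = tp * (pp0_local + pp1_local)   # ~1.216e9, i.e. the logged "1.22G"
pp0_mib = pp0_local * 2 / 2**20        # ~41.2 MiB of bf16 weights, matching "(41.25MiB)"
# With zero_stage=1 but dp=1 there is nothing to shard across data-parallel ranks,
# so DP rank 0 keeps 100% of the optimizer states, as the [ZeRO sharding] lines report.
print(f"{total/1e9:.2f}G total parameters, {pp0_mib:.1f} MiB per PP=0 rank")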
-[default0]:07/03/2024 09:52:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 09:52:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Using `datasets` library
-[default0]:07/03/2024 09:52:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:07/03/2024 09:52:29 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:52:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 09:52:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 09:52:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Start training] datetime: 2024-07-03 09:52:29.775427 | mbs: 2 | grad_accum: 512 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 09:52:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 09:52:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB
-[... between 09:52:29 and 09:53:21 every remaining rank prints the same "Repo card metadata block was not found. Setting CardData to empty." warning, once through the nanotron logger ([WARNING|DP=0|PP=...|TP=...|host]) and once as a bare huggingface_hub message; the repeated lines are omitted here ...]
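Two messages dominate the remainder of this log: the per-rank "Repo card metadata block was not found" notice from the datasets/huggingface_hub stack above, and the c10d::allreduce_ autograd UserWarning from PyTorch that follows. Neither carries new information after its first occurrence, but each is repeated by every rank. If a quieter log were wanted, something along these lines could be added to the training entrypoint; this is an untested editorial sketch, and the huggingface_hub logger name in particular is an assumption.

import logging
import warnings

# The repo-card notice is emitted through a library logger, so raise that logger's level
# (assumed logger name; the message originates in huggingface_hub's repo card handling).
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)

# The allreduce message is a Python UserWarning, so the standard warnings filter applies;
# the pattern is a regex matched against the start of the warning text.
warnings.filterwarnings("ignore", message=r"c10d::allreduce_: an autograd kernel was not registered")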
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[... the same UserWarning, together with its "return Variable._execution_engine.run_backward(" stack frame, is printed by every local rank once the first backward pass starts; the repeated copies are omitted here ...]
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
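The block of identical UserWarnings above means autograd is backpropagating through `c10d::allreduce_` even though no autograd kernel is registered for it on the Autograd dispatch key; the warning's own suggested remedy is a C++-side `torch::CppFunction::makeFallthrough()` registration. A framework-agnostic way to make the backward pass through a collective explicit, shown here only as a rough sketch and not as nanotron's actual implementation, is to wrap the collective in a `torch.autograd.Function` whose backward re-runs the same all-reduce on the incoming gradient:

```python
import torch
import torch.distributed as dist


class AllReduceSum(torch.autograd.Function):
    """Sketch of a differentiable sum all-reduce (illustrative, not nanotron's code).

    Forward: every rank ends up with the sum of all ranks' inputs.
    Backward: the gradient of that sum w.r.t. each rank's input is the sum of the
    incoming per-rank gradients, i.e. another sum all-reduce over the same group.
    """

    @staticmethod
    def forward(ctx, tensor, group=None):
        ctx.group = group
        out = tensor.clone()
        dist.all_reduce(out, op=dist.ReduceOp.SUM, group=group)
        return out

    @staticmethod
    def backward(ctx, grad_output):
        grad = grad_output.clone()
        dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=ctx.group)
        return grad, None  # no gradient for the `group` argument


def all_reduce_sum(tensor, group=None):
    # Usage sketch: stands in for a bare `dist.all_reduce(t)` call in code that
    # needs to backprop through the collective without tripping the warning above.
    return AllReduceSum.apply(tensor, group)
```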
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. 
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank1]: self.grads_buffer.append(recv_grad()) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank1]: return 
self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank1]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank1]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank1]: dist.recv( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank1]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank6]: self.grads_buffer.append(recv_grad()) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank6]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank6]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank6]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank6]: dist.recv( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank6]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank2]: self.grads_buffer.append(recv_grad()) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank2]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank2]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank2]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default2]:[rank2]: dist.recv( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank2]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default5]:[rank21]: return user_fn(self, *args) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank19]: self.grads_buffer.append(recv_grad()) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank21]: self.grads_buffer.append(recv_grad()) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank21]: return 
self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: dist.recv( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: return func(*args, **kwargs) -[default5]:[rank21]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank21]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank21]: dist.recv( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank21]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank22]: self.grads_buffer.append(recv_grad()) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank22]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank22]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank22]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank22]: dist.recv( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank22]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return 
self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: pipeline_state.run_communication() -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: Traceback (most recent call last): -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: dist.recv( -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank62]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = 
self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank20]: self.grads_buffer.append(recv_grad()) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank20]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank20]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank20]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank20]: dist.recv( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank20]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: self.grads_buffer.append(recv_grad()) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank8]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank8]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank8]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default0]:[rank8]: dist.recv( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank8]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank14]: self.grads_buffer.append(recv_grad()) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank14]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank14]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank14]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default6]:[rank14]: dist.recv( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank14]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank10]: self.grads_buffer.append(recv_grad()) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank10]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank10]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank10]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank10]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank10]: dist.recv( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank10]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank24]: self.grads_buffer.append(recv_grad()) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank24]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank24]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank24]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: dist.recv( -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank24]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank31]: self.grads_buffer.append(recv_grad()) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv 
-[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank25]: self.grads_buffer.append(recv_grad()) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank25]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank25]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank25]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank25]: dist.recv( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank25]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
[rank 41 (default1) repeats the same forward-path traceback shown above for rank 46, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.]
-[default1]:[rank9]: Traceback (most recent call last):
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank9]:     trainer.train(dataloader)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank9]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank9]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default1]:[rank9]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default1]:[rank9]:     grad_accumulator.backward(sum(activations))
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default1]:[rank9]:     result = loss.backward()
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default1]:[rank9]:     torch.autograd.backward(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default1]:[rank9]:     _engine_run_backward(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default1]:[rank9]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default1]:[rank9]:     return user_fn(self, *args)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default1]:[rank9]:     pipeline_state.run_communication()
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default1]:[rank9]:     self.grads_buffer.append(recv_grad())
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default1]:[rank9]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank9]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank9]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank9]:     dist.recv(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank9]:     return func(*args, **kwargs)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank9]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[ranks 23, 13, 3, 27 and 30 repeat the same backward-path traceback shown above for rank 9, each ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
[rank 44 (default4) repeats the same forward-path traceback shown above for rank 46, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.]
[rank 16 (default0) repeats the same backward-path traceback shown above for rank 9, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab22849897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fab23b22c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fab23b27a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fab23b28dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7fab6f5c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7fab74608609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7fab743d3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]:  what():  [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
[the same frames #0-#6 as above are printed again for the what() stack]
-[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab22849897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: <unknown function> + 0xe32119 (0x7fab237ac119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: <unknown function> + 0xd3e95 (0x7fab6f5c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: <unknown function> + 0x8609 (0x7fab74608609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7fab743d3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
[rank 12 (default4) repeats the same backward-path traceback shown above for rank 9, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
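The watchdog lines above are the closest thing to a root cause in this log: a point-to-point SEND on pipeline process group 4 (SeqNum=15, 8192 elements) never completed within the default 600000 ms NCCL timeout, so the process was taken down and every peer waiting in `dist.recv` failed with the aborted-communicator errors shown earlier. If an exchange is genuinely expected to take longer than 10 minutes, the timeout can be raised when the process group is created; the sketch below uses only the plain `torch.distributed` API and assumes the script is launched under torchrun so the rendezvous environment variables are already set.

```python
# Hedged sketch: raising the process-group timeout above the 10-minute default
# reported by the watchdog (Timeout(ms)=600000). This only helps if the stalled
# SEND would eventually complete; here the hang may instead point at a peer that
# never posts the matching receive, so a longer timeout mainly buys time to
# inspect the stall (e.g. with NCCL_DEBUG=INFO).
import datetime

import torch.distributed as dist

dist.init_process_group(
    backend="nccl",
    timeout=datetime.timedelta(minutes=30),
)
```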
-[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank29]: self.grads_buffer.append(recv_grad()) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank29]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank29]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank29]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default5]:[rank29]: dist.recv( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank29]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank28]: self.grads_buffer.append(recv_grad()) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank28]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank28]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank28]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank28]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank28]: dist.recv( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank28]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward
-[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default0]:[rank56]: pipeline_state.run_communication()
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default0]:[rank56]: recv_activation_tensor = recv_activation()
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank56]: dist.recv(
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank56]: return func(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
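Reading the frames above: the receiving pipeline stage first does a blocking dist.recv() for a small metadata message (presumably shape/dtype information) inside P2P._recv_meta, and only then posts the receive for the activation tensor itself; when the peer stage never issues the matching send, that first recv blocks until NCCL aborts the communicator. The sketch below illustrates that general "metadata first, payload second" pattern with plain torch.distributed calls. It is an illustrative sketch only, not nanotron's actual _recv_meta/irecv_tensors implementation, and the helper names send_with_meta/recv_with_meta are invented for the example.

# Minimal sketch (not nanotron's code) of the metadata-then-payload P2P pattern
# the tracebacks above are stuck in. Assumes an already-initialized process
# group (e.g. NCCL) and device-resident tensors; helper names are invented.
import torch
import torch.distributed as dist

def send_with_meta(tensor, dst, tag=0):
    # 1) blocking send of the metadata: [ndim, *shape]
    meta = torch.tensor([tensor.dim(), *tensor.shape], dtype=torch.long, device=tensor.device)
    dist.send(torch.tensor([meta.numel()], dtype=torch.long, device=tensor.device), dst=dst, tag=tag)
    dist.send(meta, dst=dst, tag=tag)
    # 2) send the payload itself
    dist.send(tensor.contiguous(), dst=dst, tag=tag)

def recv_with_meta(src, device, dtype=torch.float32, tag=0):
    # 1) blocking recv of the metadata; this is the dist.recv() the ranks above never return from
    meta_len = torch.empty(1, dtype=torch.long, device=device)
    dist.recv(meta_len, src=src, tag=tag)
    meta = torch.empty(int(meta_len.item()), dtype=torch.long, device=device)
    dist.recv(meta, src=src, tag=tag)
    shape = [int(s) for s in meta[1:].tolist()]
    # 2) allocate a buffer from the received shape and recv the payload
    buffer = torch.empty(shape, dtype=dtype, device=device)
    dist.recv(buffer, src=src, tag=tag)
    return buffer

Because the consumer cannot know the payload shape before the metadata arrives, a producer that has crashed or is itself blocked leaves the consumer sitting in that first dist.recv() until the NCCL watchdog tears the process group down, which is why every waiting rank reports "NCCL communicator was aborted" rather than its own root cause.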
[ranks 63, 55, 40, 43, 45, 52 and 54 each raised the same forward-path traceback as rank 56 above, ending in torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1]
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600057 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59f1d05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59f2fdec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59f2fe3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59f2fe4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5a3ea7de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5a43ac4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5a4388f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59f1d05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59f2fdec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59f2fe3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59f2fe4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5a3ea7de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5a43ac4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5a4388f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59f1d05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f59f2c68119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f5a3ea7de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f5a43ac4609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #4: clone + 0x43 (0x7f5a4388f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
[rank 42 raised the same forward-path traceback as rank 56 above, ending in torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1]
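The rank 47 lines above are the root-cause signal for this run: a point-to-point SEND in process group 4 sat for the full 600 s timeout (Timeout(ms)=600000), the ProcessGroupNCCL watchdog aborted the communicator, and every peer rank then surfaced the secondary "NCCL communicator was aborted" error seen in the tracebacks. To see the mechanism in isolation, the toy script below, which is not part of the benchmark and assumes a machine with at least two visible GPUs, provokes the same class of watchdog timeout by posting a recv that is never matched:

# Toy reproduction of a ProcessGroupNCCL watchdog timeout (assumption: >= 2 GPUs).
# Rank 0 posts a recv that rank 1 never matches, so after `timeout` the NCCL
# watchdog aborts the communicator, much like the WorkNCCL timeout logged above.
import os
import time
from datetime import timedelta

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank, world_size):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", rank=rank, world_size=world_size,
                            timeout=timedelta(seconds=30))
    if rank == 0:
        buf = torch.empty(1024, device="cuda")
        dist.recv(buf, src=1)   # no matching send -> watchdog timeout, then abort
    else:
        time.sleep(120)         # rank 1 "hangs" instead of sending
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)

With the benchmark's 600000 ms default the same kind of hang simply takes ten minutes to surface, which lines up with the 600057 ms the watchdog reports above.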
[rank 36 raised the same forward-path traceback as rank 56 above, ending in torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1]
-[default6]:[rank38]: Traceback (most recent call last):
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default6]:[rank38]: trainer.train(dataloader)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank37]: Traceback (most recent call last):
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default5]:[rank37]: trainer.train(dataloader)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) 
-[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in 
_call_impl -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: dist.recv( -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states 
-[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank15]: Traceback (most recent call last):
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank15]:     trainer.train(dataloader)
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank15]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank15]:     outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank15]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank15]:     grad_accumulator.backward(sum(activations))
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank15]:     result = loss.backward()
-[default7]:[rank15]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank15]:     torch.autograd.backward(
-[default7]:[rank15]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank15]:     _engine_run_backward(
-[default7]:[rank15]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank15]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default7]:[rank15]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank15]:     return user_fn(self, *args)
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank15]:     pipeline_state.run_communication()
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default7]:[rank15]:     self.grads_buffer.append(recv_grad())
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default7]:[rank15]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank15]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank15]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank15]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank15]:     dist.recv(
-[default7]:[rank15]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank15]:     return func(*args, **kwargs)
-[default7]:[rank15]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank15]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2c79ea1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2c7b17ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2c7b17fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2c7b180dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: + 0xd3e95 (0x7f2cc6c19e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: + 0x8609 (0x7f2ccbc60609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7f2ccba2b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default7]:  what():  [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:[... same checkTimeout frame dump as above ...]
-[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2c79ea1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: + 0xe32119 (0x7f2c7ae04119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: + 0xd3e95 (0x7f2cc6c19e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #3: + 0x8609 (0x7f2ccbc60609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #4: clone + 0x43 (0x7f2ccba2b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default0]:[... same terminate, what() and checkTimeout/ncclCommWatchdog frame dumps as rank 31, with libtorch addresses in the 0x7f83... range ...]
-[default4]:[rank60]: Traceback (most recent call last):
-[default4]:[rank60]: [... same pipeline-parallel activation-recv traceback as rank 36 ...]
-[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default5]:[rank53]: Traceback (most recent call last):
-[default5]:[rank53]: [... same pipeline-parallel activation-recv traceback as rank 36 ...]
-[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default5]:[rank5]: Traceback (most recent call last):
-[default5]:[rank5]: [... same pipeline-parallel gradient-recv traceback as rank 15 ...]
-[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank0]: Traceback (most recent call last):
-[default0]:[rank0]: [... same pipeline-parallel gradient-recv traceback as rank 15 ...]
-[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank4]: Traceback (most recent call last):
-[default4]:[rank4]: [... same pipeline-parallel gradient-recv traceback as rank 15 ...]
-[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
-[default2]:[... same terminate, what() and checkTimeout/ncclCommWatchdog frame dumps as rank 31, with libtorch addresses in the 0x7fb3... range ...]
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600065 milliseconds before timing out.
-[default3]:[... same terminate, what() and checkTimeout frame dump as rank 31, with libtorch addresses in the 0x7f78... range ...]
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f788bac4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f788cd9dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f788cda2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f788cda3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f78d883ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f78dd883609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f78dd64e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f788bac4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f788ca27119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f78d883ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f78dd883609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f78dd64e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f070ad58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f070c031c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f070c036a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f070c037dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0757ad0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f075cb17609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f075c8e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f070ad58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f070c031c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f070c036a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f070c037dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0757ad0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f075cb17609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f075c8e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f070ad58897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f070bcbb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f0757ad0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f075cb17609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f075c8e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f59ee3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f5b1bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f5b1c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f5b1c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2fa6c5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2fabca2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2faba6d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f59ee3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f5b1bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f5b1c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f5b1c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2fa6c5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2fabca2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2faba6d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f59ee3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f2f5ae46119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f2fa6c5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f2fabca2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f2faba6d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc613384897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc61465dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc614662a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc614663dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fc6600fce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fc665143609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fc664f0e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc613384897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc61465dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc614662a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc614663dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fc6600fce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fc665143609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fc664f0e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc613384897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fc6142e7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fc6600fce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fc665143609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fc664f0e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f24c930e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f24ca5e7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f24ca5eca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f24ca5eddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f2516086e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f251b0cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f251ae98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f24c930e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f24ca5e7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f24ca5eca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f24ca5eddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f2516086e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f251b0cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f251ae98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f24c930e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f24ca271119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f2516086e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f251b0cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f251ae98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff97ff52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff98122bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff981230a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff981231dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff9ccccae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff9d1d11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff9d1adc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff97ff52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff98122bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff981230a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff981231dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff9ccccae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff9d1d11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff9d1adc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff97ff52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7ff980eb5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7ff9ccccae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7ff9d1d11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7ff9d1adc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff9da235897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff9db50ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff9db513a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff9db514dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ffa26fade95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ffa2bff4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ffa2bdbf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff9da235897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff9db50ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff9db513a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff9db514dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ffa26fade95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ffa2bff4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ffa2bdbf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff9da235897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7ff9db198119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7ffa26fade95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7ffa2bff4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7ffa2bdbf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e8fb92897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e90e6bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e90e70a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e90e71dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2edc90ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:frame #5: + 0x8609 (0x7f2ee1951609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2ee171c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]: -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. 
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e8fb92897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e90e6bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f15c6234897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
[... the excerpt opens with the tail of the same NCCL watchdog backtraces from local processes default2, default3 and default6 (PG 4 Rank 1, durations 600029-600036 ms); one representative, de-interleaved instance per process group follows ...]
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600043 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4e732cc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f4e745a5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4e745aaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4e745abdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f4ec0044e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f4ec508b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f4ec4e56353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]:  what():  [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600043 milliseconds before timing out.
[... the what() message is followed by the same checkTimeout backtrace and by a second backtrace raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (c10::Error::Error -> libtorch_cuda.so +0xe32119 -> libstdc++.so.6 +0xd3e95 -> libpthread.so.0 +0x8609 -> clone). Ranks 52, 56 and 63 abort with an identical PG 4 Rank 1 sequence; only the shared-library load addresses and the reported durations (600030-600036 ms) differ ...]
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out.
[... ranks 10, 14, 16, 22 and 23 abort with the identical PG 4 Rank 0 sequence (SeqNum=15, OpType=SEND, NumelIn=8192), durations 600059-600072 ms, each followed by the same c10::DistBackendError termination and backtraces as above ...]
-[default2]:[rank18]: Traceback (most recent call last):
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank18]:     trainer.train(dataloader)
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank18]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank18]:     outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default2]:[rank18]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default2]:[rank18]:     grad_accumulator.backward(sum(activations))
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default2]:[rank18]:     result = loss.backward()
-[default2]:[rank18]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default2]:[rank18]:     torch.autograd.backward(
-[default2]:[rank18]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default2]:[rank18]:     _engine_run_backward(
-[default2]:[rank18]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default2]:[rank18]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default2]:[rank18]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default2]:[rank18]:     return user_fn(self, *args)
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default2]:[rank18]:     pipeline_state.run_communication()
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default2]:[rank18]:     self.grads_buffer.append(recv_grad())
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default2]:[rank18]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank18]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank18]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank18]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default2]:[rank18]:     dist.recv(
-[default2]:[rank18]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank18]:     return func(*args, **kwargs)
-[default2]:[rank18]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank18]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank57]: Traceback (most recent call last):
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank57]:     trainer.train(dataloader)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank57]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank57]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank57]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank57]:     output = model(**micro_batch)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]:     return forward_call(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank57]:     sharded_logits = self.model(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]:     return forward_call(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank57]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank57]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]:     return forward_call(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank57]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank57]:     pipeline_state.run_communication()
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank57]:     recv_activation_tensor = recv_activation()
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank57]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank57]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank57]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank57]:     dist.recv(
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank57]:     return func(*args, **kwargs)
-[default1]:[rank57]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank57]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... ranks 49 (default1) and 50 (default2) raise the same forward-path traceback, interleaved line by line in the raw log; rank 49 ends with the same "NCCL communicator was aborted on rank 1." error, while rank 50's copy is cut off at _recv_meta (p2p.py, line 269) where this excerpt ends ...]
-[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff45008d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff451366c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff45136ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff45136cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7ff49ce05e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7ff4a1e4c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7ff4a1c17353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff45008d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff451366c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff45136ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff45136cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7ff49ce05e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7ff4a1e4c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7ff4a1c17353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff45008d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7ff450ff0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7ff49ce05e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7ff4a1e4c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7ff4a1c17353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc2b6bc9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc2b7ea2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc2b7ea7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc2b7ea8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fc303941e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fc308988609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fc308753353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc2b6bc9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc2b7ea2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc2b7ea7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc2b7ea8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fc303941e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fc308988609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fc308753353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc2b6bc9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fc2b7b2c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fc303941e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fc308988609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fc308753353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9413e83897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f941515cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9415161a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9415162dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9460bfbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9465c42609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9465a0d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9413e83897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f941515cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9415161a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9415162dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9460bfbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9465c42609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9465a0d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9413e83897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f9414de6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f9460bfbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f9465c42609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f9465a0d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6afad24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6afbffdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6afc002a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6afc003dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6b47a9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6b4cae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f6b4c8ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6afad24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6afbffdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6afc002a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6afc003dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6b47a9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6b4cae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f6b4c8ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6afad24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f6afbc87119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f6b47a9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f6b4cae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f6b4c8ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f907452e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9075807c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f907580ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f907580ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f90c12a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f90c62ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f90c60b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f907452e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9075807c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f907580ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f907580ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f90c12a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f90c62ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f90c60b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f907452e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f9075491119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f90c12a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f90c62ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f90c60b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank11]: self.grads_buffer.append(recv_grad()) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank11]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank11]: buffers, futures = 
self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank11]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank11]: dist.recv( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank11]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a75cf9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4a76fd2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4a76fd7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4a76fd8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4ac2a71e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4ac7ab8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4ac7883353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a75cf9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4a76fd2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4a76fd7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4a76fd8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4ac2a71e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4ac7ab8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4ac7883353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a75cf9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f4a76c5c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f4ac2a71e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f4ac7ab8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f4ac7883353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. 
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63bc9c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe99269b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f63bdc9dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe993974c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f63bdca2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe993979a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe99397adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe9df413e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fe9e445a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f63bdca3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f640973ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f640e783609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f640e54e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #6: clone + 0x43 (0x7fe9e4225353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63bc9c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe99269b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f63bdc9dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f63bdca2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f63bdca3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f640973ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe993974c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe993979a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe99397adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe9df413e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f640e783609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f640e54e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #5: + 0x8609 (0x7fe9e445a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fe9e4225353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe99269b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63bc9c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fe9935fe119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fe9df413e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #1: + 
0xe32119 (0x7f63bd927119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[log.out condensed: the following ranks all emit the same NCCL watchdog failure sequence, interleaved across local GPUs:
- "[E ProcessGroupNCCL.cpp:1537] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14",
- "[E ProcessGroupNCCL.cpp:577/583] Some NCCL operations have failed or timed out ... taking the entire process down",
- "[E ProcessGroupNCCL.cpp:1414] Process group watchdog thread terminated with exception: Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, Timeout(ms)=600000)" after roughly 600 s,
- followed in every case by the identical checkTimeout / watchdogHandler / ncclCommWatchdog backtrace through libc10.so, libtorch_cuda.so, libstdc++.so.6, libpthread.so.0 and libc.so.6, and process termination via c10::DistBackendError.
- PG 4 Rank 1 processes (NumelIn=NumelOut=524288): global ranks 36, 38, 35, 42, 43 and 45, timing out after 600045, 600066, 600054, 600080, 600036 and 600033 ms respectively.
- PG 4 Rank 0 processes (NumelIn=NumelOut=8192): global ranks 9, 12, 13, 15, 5, 27, 30, 25 and 28, timing out after 600082, 600071, 600063, 600063, 600073, 600058, 600062, 600057 and 600071 ms respectively.
- The per-rank stack traces differ only in library load addresses and carry no additional information; the dump continues in the same pattern for the remaining ranks.]
#4: clone + 0x43 (0x7f173c27e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f08a3c66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f08a4f3fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f08a4f44a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f08a4f45dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f08f09dee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f08f5a25609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f08f57f0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f08a3c66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f08a4f3fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f08a4f44a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f08a4f45dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f08f09dee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f08f5a25609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f08f57f0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb6d4c9a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]: -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb6d5f73c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f08a3c66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f08a4bc9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb6d5f78a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f08f09dee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f08f5a25609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f08f57f0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb6d5f79dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fb721a12e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fb726a59609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fb726824353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: 
[Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb6d4c9a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb6d5f73c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb6d5f78a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb6d5f79dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fb721a12e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fb726a59609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fb726824353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb6d4c9a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fb6d5bfd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fb721a12e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fb726a59609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fb726824353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd3944ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd395785c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd39578aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd39578bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fd3e1224e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fd3e626b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fd3e6036353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd3944ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd395785c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd39578aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd39578bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fd3e1224e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fd3e626b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fd3e6036353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd3944ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fd39540f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fd3e1224e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fd3e626b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fd3e6036353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0cab68f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0cac968c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0cac96da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0cac96edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f0cf8407e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0cfd44e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f0cfd219353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0cab68f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0cac968c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0cac96da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0cac96edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f0cf8407e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0cfd44e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f0cfd219353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0cab68f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f0cac5f2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f0cf8407e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f0cfd44e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f0cfd219353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcb93f7f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcb95258c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcb9525da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcb9525edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fcbe0cf7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fcbe5d3e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fcbe5b09353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcb93f7f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcb95258c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcb9525da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcb9525edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fcbe0cf7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fcbe5d3e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fcbe5b09353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcb93f7f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fcb94ee2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fcbe0cf7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fcbe5d3e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fcbe5b09353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efdf402c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efdf5305c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efdf530aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efdf530bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7efe40da4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7efe45deb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7efe45bb6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efdf402c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efdf5305c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efdf530aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efdf530bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7efe40da4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7efe45deb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7efe45bb6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efdf402c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7efdf4f8f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7efe40da4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7efe45deb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7efe45bb6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f084f2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8f097cbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8f097d0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8f097d1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8f5526ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8f5a2b1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8f5a07c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f084f2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8f097cbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8f097d0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8f097d1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8f5526ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8f5a2b1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8f5a07c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f084f2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f8f09455119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f8f5526ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f8f5a2b1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f8f5a07c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc629e00897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc62b0d9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc62b0dea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc62b0dfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc676b78e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc67bbbf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc67b98a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc629e00897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc62b0d9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc62b0dea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc62b0dfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc676b78e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc67bbbf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc67b98a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc629e00897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fc62ad63119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fc676b78e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fc67bbbf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fc67b98a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fadd120c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fadd24e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fadd24eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fadd24ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fae1df84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fae22fcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fae22d96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fadd120c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fadd24e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fadd24eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fadd24ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fae1df84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fae22fcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fae22d96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fadd120c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fadd216f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fae1df84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fae22fcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fae22d96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02fd023897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02fe2fcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02fe301a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02fe302dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0349d9be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f034ede2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #6: clone + 0x43 (0x7f034ebad353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b3bd7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6b3d055c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6b3d05aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6b3d05bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f6b88af4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f6b8db3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #6: clone + 0x43 (0x7f6b8d906353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b3bd7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6b3d055c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02fd023897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02fe2fcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02fe301a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6b3d05aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6b3d05bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02fe302dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f6b88af4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #4: + 0xd3e95 (0x7f0349d9be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f034ede2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f034ebad353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #5: + 0x8609 (0x7f6b8db3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]: -[default2]:frame #6: clone + 0x43 (0x7f6b8d906353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02fd023897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f02fdf86119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b3bd7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #2: + 0xd3e95 (0x7f0349d9be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f034ede2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f034ebad353 in 
/lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:frame #1: + 0xe32119 (0x7f6b3ccdf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f6b88af4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f6b8db3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f6b8d906353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7ac79c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7ac8ca2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7ac8ca7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7ac8ca8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7b14741e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7b19788609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f7b19553353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7ac79c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7ac8ca2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7ac8ca7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7ac8ca8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7b14741e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7b19788609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f7b19553353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7ac79c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f7ac892c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f7b14741e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f7b19788609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f7b19553353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5eaac53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5eabf2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5eabf31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5eabf32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5ef79cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5efca12609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5efc7dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5eaac53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5eabf2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5eabf31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5eabf32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5ef79cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5efca12609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5efc7dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5eaac53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5eabbb6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5ef79cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f5efca12609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f5efc7dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05c95d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f05ca8b0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f05ca8b5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f05ca8b6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f061634fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f061b396609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f061b161353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05c95d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f05ca8b0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f05ca8b5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f05ca8b6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f061634fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f061b396609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f061b161353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05c95d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f05ca53a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f061634fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f061b396609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f061b161353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f85e45b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f85e588bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f85e5890a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f85e5891dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f863132ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f8636371609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f863613c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f85e45b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f85e588bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f85e5890a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f85e5891dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f863132ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f8636371609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f863613c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f85e45b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f85e5515119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f863132ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f8636371609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f863613c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feb279de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feb28cb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feb28cbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feb28cbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7feb74756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7feb7979d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7feb79568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feb279de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feb28cb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feb28cbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feb28cbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7feb74756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7feb7979d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7feb79568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feb279de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7feb28941119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7feb74756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7feb7979d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7feb79568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -W0703 10:03:35.451000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967350 closing signal SIGTERM -W0703 10:03:35.451000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967351 closing signal SIGTERM -W0703 10:03:35.451000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967352 closing signal SIGTERM -W0703 10:03:35.452000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967354 closing signal SIGTERM -W0703 10:03:35.452000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967355 closing signal SIGTERM -W0703 10:03:35.452000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967356 closing signal SIGTERM -W0703 10:03:35.452000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967357 closing signal SIGTERM -E0703 10:03:37.455000 140673293223744 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 3 (pid: 967353) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:35 - host : ip-26-0-171-88.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 967353) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 967353 -============================================================ -srun: error: ip-26-0-171-88: task 7: Exited with exit code 1 -W0703 10:03:40.309000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2558866 closing signal SIGTERM -W0703 10:03:40.309000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2558867 closing signal SIGTERM -W0703 10:03:40.309000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2558868 closing signal SIGTERM -W0703 10:03:40.309000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2558869 closing signal SIGTERM -W0703 10:03:40.309000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2558870 closing signal SIGTERM -W0703 10:03:40.309000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2558872 closing signal SIGTERM -W0703 10:03:40.309000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2558873 closing signal SIGTERM -W0703 10:03:40.398000 139787007002432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 39164 closing signal SIGTERM -W0703 10:03:40.415000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3979164 closing signal SIGTERM -W0703 10:03:40.416000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3979165 closing signal SIGTERM -W0703 10:03:40.416000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3979166 closing signal SIGTERM -W0703 10:03:40.416000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3979167 closing signal SIGTERM -W0703 10:03:40.416000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3979168 closing signal SIGTERM -W0703 10:03:40.416000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3979169 closing signal SIGTERM -W0703 
10:03:40.416000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3979171 closing signal SIGTERM -W0703 10:03:40.423000 140063912851264 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3099193 closing signal SIGTERM -W0703 10:03:40.424000 140063912851264 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3099195 closing signal SIGTERM -W0703 10:03:40.424000 140063912851264 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3099199 closing signal SIGTERM -W0703 10:03:40.440000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3431821 closing signal SIGTERM -W0703 10:03:40.440000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3431822 closing signal SIGTERM -W0703 10:03:40.440000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3431823 closing signal SIGTERM -W0703 10:03:40.440000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3431824 closing signal SIGTERM -W0703 10:03:40.441000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3431825 closing signal SIGTERM -W0703 10:03:40.441000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3431826 closing signal SIGTERM -W0703 10:03:40.441000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3431827 closing signal SIGTERM -E0703 10:03:40.534000 140108365117248 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 570047) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 10:03:40.546000 139787007002432 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 39160) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 570048) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570048 -[2]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 570049) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570049 -[3]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 570050) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570050 -[4]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 570051) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570051 -[5]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 570052) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570052 -[6]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 570053) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570053 -[7]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 570054) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570054 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-139.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 570047) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 570047 -============================================================ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-247.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 39161) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 39161 -[2]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-247.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 39162) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 39162 -[3]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-247.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 39163) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 39163 -[4]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-247.ec2.internal - rank 
: 29 (local_rank: 5) - exitcode : -6 (pid: 39165) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 39165 -[5]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-247.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 39166) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 39166 -[6]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-247.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 39167) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 39167 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-247.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 39160) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 39160 -============================================================ -srun: error: ip-26-0-169-247: task 3: Exited with exit code 1 -srun: error: ip-26-0-169-139: task 0: Exited with exit code 1 -E0703 10:03:41.875000 140063912851264 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3099192) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:03:41.894000 140063912851264 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-170-31.ec2.internal_3099123_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:41.940000 140063912851264 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-170-31.ec2.internal_3099123_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:41.961000 140063912851264 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-170-31.ec2.internal_3099123_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:03:40 - host : ip-26-0-170-31.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 3099194) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3099194 -[2]: - time : 2024-07-03_10:03:40 - host : ip-26-0-170-31.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 3099196) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3099196 -[3]: - time : 2024-07-03_10:03:40 - host : ip-26-0-170-31.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 3099197) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3099197 -[4]: - time : 2024-07-03_10:03:40 - host : ip-26-0-170-31.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 3099198) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3099198 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:40 - host : ip-26-0-170-31.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 3099192) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3099192 -============================================================ -E0703 10:03:42.290000 139894485751616 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 5 (pid: 2558871) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:03:42.306000 139894485751616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2558795_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:42.338000 139894485751616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2558795_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:42.346000 139894485751616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2558795_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:40 - host : ip-26-0-169-239.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 2558871) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2558871 -============================================================ -E0703 10:03:42.604000 139815232800576 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 6 (pid: 3979170) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:03:42.620000 139815232800576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3979096_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:42.650000 139815232800576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3979096_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:42.658000 139815232800576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3979096_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:40 - host : ip-26-0-171-62.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 3979170) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3979170 -============================================================ -E0703 10:03:42.965000 140344336561984 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 7 (pid: 3431828) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:03:42.983000 140344336561984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_3431752_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-170-31: task 4: Exited with exit code 1 -W0703 10:03:43.014000 140344336561984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_3431752_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:43.022000 140344336561984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_3431752_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:40 - host : ip-26-0-171-56.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 3431828) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3431828 -============================================================ -srun: error: ip-26-0-169-239: task 2: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 6: Exited with exit code 1 -srun: error: ip-26-0-171-56: task 5: Exited with exit code 1 -W0703 10:03:45.139000 140595400443648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-207.ec2.internal_2587378_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 10:03:45.628000 140601061177152 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 2587447) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:03:45.641000 140601061177152 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2587378_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:45.674000 140601061177152 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2587378_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:03:45.705000 140601061177152 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2587378_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 2587448) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587448 -[2]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 2587449) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587449 -[3]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 2587450) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587450 -[4]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 2587451) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587451 -[5]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 2587452) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587452 -[6]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 2587453) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587453 -[7]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 2587454) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587454 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:03:45 - host : ip-26-0-169-207.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 2587447) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2587447 -============================================================ -srun: error: ip-26-0-169-207: task 1: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
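Every per-node summary above reports exitcode -6, i.e. the workers were killed with SIGABRT rather than exiting on their own, and the elastic agents then fail to shut down the c10d rendezvous, presumably because the store on the master node is already gone by that point. The "Root Cause (first observed failure)" block is the part worth reading first: it names the earliest rank and host to die. A minimal grep sketch for pulling that block and the first low-level error out of a run log; the log path is a placeholder, and the patterns simply mirror the classification already used by the bench.slurm scripts in this diff:

    LOG=log.out   # placeholder: point this at the run's log.out

    # First failure reported by the torchrun elastic agent
    grep -A 8 "Root Cause (first observed failure)" "$LOG" | head -n 12

    # First matching low-level error that preceded the SIGABRT, if any
    grep -m 1 -E "OutOfMemoryError|CUDA error|Timeout at NCCL" "$LOG"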
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-2/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/bench.slurm deleted file mode 100644 index a06af122c7ec8e7e28f629cacdaecf625ff9bd02..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/config.yaml deleted file mode 100644 index cc95f354432d6fcb94d88da4a93f305fe5d8413b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 4 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 256 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out deleted file mode 100644 index b62490129c372ee64c86d5a00eb5bdcf8f85b0ff..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/log.out +++ /dev/null @@ -1,6350 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:14:28 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
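The config.yaml removed just above pins down the sizing of this run: parallelism dp=1, tp=32, pp=2 is exactly the 64 GPUs of the 8x8 allocation in the accompanying bench.slurm, and micro_batch_size=256 with batch_accumulation_per_replica=4 on a single data-parallel replica gives a global batch of 1,024 sequences of 4,096 tokens, i.e. about 4.2M tokens per optimizer step (the launcher echoes the same numbers further down: mbs: 256 | grad_accum: 4 | global_batch_size: 1024). A quick sanity check of that arithmetic, with the variable names being illustrative only:

    dp=1; tp=32; pp=2
    mbs=256; grad_accum=4; seq_len=4096

    echo "world size:        $((dp * tp * pp))"                    # 64 GPUs
    echo "global batch size: $((dp * mbs * grad_accum))"           # 1024 sequences
    echo "tokens per step:   $((dp * mbs * grad_accum * seq_len))" # 4194304 (~4.2M)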
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:14:33.723000 140026962589504 torch/distributed/run.py:757] -W0703 09:14:33.723000 140026962589504 torch/distributed/run.py:757] ***************************************** -W0703 09:14:33.723000 140026962589504 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:14:33.723000 140026962589504 torch/distributed/run.py:757] ***************************************** -W0703 09:14:33.723000 140611344389952 torch/distributed/run.py:757] -W0703 09:14:33.723000 140611344389952 torch/distributed/run.py:757] ***************************************** -W0703 09:14:33.723000 140611344389952 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:14:33.723000 140611344389952 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.035000 139850729969472 torch/distributed/run.py:757] -W0703 09:14:34.035000 139850729969472 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.035000 139850729969472 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:14:34.035000 139850729969472 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.154000 139796821264192 torch/distributed/run.py:757] -W0703 09:14:34.154000 139796821264192 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.154000 139796821264192 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:14:34.154000 139796821264192 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.612000 139869444958016 torch/distributed/run.py:757] -W0703 09:14:34.612000 139869444958016 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.612000 139869444958016 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:14:34.612000 139869444958016 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.619000 139822731736896 torch/distributed/run.py:757] -W0703 09:14:34.619000 139822731736896 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.619000 139822731736896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:14:34.619000 139822731736896 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.703000 139672741885760 torch/distributed/run.py:757] -W0703 09:14:34.703000 139672741885760 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.703000 139672741885760 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:14:34.703000 139672741885760 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.743000 140547717470016 torch/distributed/run.py:757] -W0703 09:14:34.743000 140547717470016 torch/distributed/run.py:757] ***************************************** -W0703 09:14:34.743000 140547717470016 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:14:34.743000 140547717470016 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:14:59 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=2, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=32, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:14:59 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=256, -[default0]:07/03/2024 09:14:59 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=4, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256')), -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 09:14:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=27|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=27|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=27|ip-26-0-167-177]: No checkpoint path provided. -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=30|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=30|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=30|ip-26-0-167-177]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=26|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=26|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=26|ip-26-0-167-177]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=24|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=24|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=24|ip-26-0-167-177]: No checkpoint path provided. -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=29|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=29|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=29|ip-26-0-167-177]: No checkpoint path provided. -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=31|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=31|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=31|ip-26-0-167-177]: No checkpoint path provided. -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=25|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=28|ip-26-0-167-177]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=28|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=28|ip-26-0-167-177]: No checkpoint path provided. -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=25|ip-26-0-167-177]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=25|ip-26-0-167-177]: No checkpoint path provided. -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=23|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=23|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=23|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=16|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=16|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=16|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=20|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=20|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=20|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=18|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=18|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=18|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=17|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=17|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=17|ip-26-0-166-125]: No checkpoint path provided. 
-[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=22|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=22|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=22|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=21|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=21|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=21|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: No checkpoint path provided. 
-[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-147]: No checkpoint path provided. 
-[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. 
-[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=19|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=19|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=19|ip-26-0-166-125]: No checkpoint path provided. -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=0|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=1|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=2|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. 
-[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB)
-[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB
-[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=7|ip-26-0-164-207]: No checkpoint path provided.
-[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB)
-[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB
-[default4]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=4|ip-26-0-164-207]: No checkpoint path provided.
-[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB)
-[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB
-[default7]:07/03/2024 09:15:18 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided.
-[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB)
-[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB
-[default5]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=5|ip-26-0-164-207]: No checkpoint path provided.
-[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB)
-[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB
-[default3]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=3|ip-26-0-164-207]: No checkpoint path provided.
-[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: Local number of parameters: 16.4M (31.22MiB)
-[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB
-[default6]:07/03/2024 09:15:18 [INFO|DP=0|PP=1|TP=6|ip-26-0-164-207]: No checkpoint path provided.
-[default0]:07/03/2024 09:15:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 09:15:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 09:15:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states
-[default0]:07/03/2024 09:15:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 09:15:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library
-[default0]:07/03/2024 09:15:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:07/03/2024 09:15:21 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:15:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 09:15:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 09:15:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]:
-[default0]:07/03/2024 09:15:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 09:15:22.398455 | mbs: 256 | grad_accum: 4 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 09:15:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 09:15:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB
-[default0]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=16|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=18|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=26|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=25|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=20|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=4|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=27|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=29|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=28|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=24|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=28|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=25|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=20|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=22|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=17|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=17|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=23|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=23|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=19|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=27|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=19|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=22|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=0|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=7|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=3|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=30|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=24|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=21|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=16|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=30|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=31|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=2|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=6|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=18|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=29|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:22 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=5|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 09:15:22 [WARNING|DP=0|PP=1|TP=26|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 09:15:23 [WARNING|DP=0|PP=1|TP=21|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 09:15:23 [WARNING|DP=0|PP=1|TP=1|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 09:15:23 [WARNING|DP=0|PP=1|TP=31|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty.
-[default7]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 09:15:27 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:[rank24]: Traceback (most recent call last):
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank24]: trainer.train(dataloader)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank24]: output = model(**micro_batch)
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank24]: return self._call_impl(*args, **kwargs)
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank24]: return forward_call(*args, **kwargs)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank24]: sharded_logits = self.model(
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank24]: return self._call_impl(*args, **kwargs)
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank24]: return forward_call(*args, **kwargs)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank24]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank24]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank24]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank24]: output = self.o_proj(attention_output) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default4]:[rank28]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: return row_linear( -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: out = F.linear(input, weight, bias) -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: output = model(**micro_batch) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: sharded_logits = self.model( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: output = model(**micro_batch) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: sharded_logits = self.model( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, 
in _wrapped_call_impl -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank28]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank28]: output = 
self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank27]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/na[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank27]: output = self.o_proj(attention_output) -notron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nano[default4]:[rank28]: output = self.o_proj(attention_output) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -tron/models/llama.py", line 764, in forward -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank8[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -]: output = self.pp_block(**new_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/minif[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -orge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank8]: output = self.o_proj(attention_output) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, 
in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -8]: out = F.linear(input, weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return row_linear( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: out = F.linear(input, weight, bias) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.83 GiB is free. Including non-PyTorch memory, this process has 76.49 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: return row_linear( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.01 GiB is free. Including non-PyTorch memory, this process has 76.31 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank5]: output = model(**micro_batch) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank5]: sharded_logits = self.model( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line [default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank26]: output = model(**micro_batch) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank26]: sharded_logits = self.model( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank26]: output = self.pp_block(**new_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank26]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank26]: output = self.o_proj(attention_output) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: return row_linear( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: out = F.linear(input, weight, bias) -[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.83 GiB is free. Including non-PyTorch memory, this process has 76.49 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: sharded_logits = self.model( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank17]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank17]: output = self.o_proj(attention_output) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.65 GiB is free. Including non-PyTorch memory, this process has 76.67 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
[Ranks 13, 14, 20, 21, 22, 23, 25, 29, 30 and 31 raised the identical OutOfMemoryError traceback, failing at the same F.linear call inside row_linear, reached via self.o_proj(attention_output). Their interleaved copies of the traceback above are collapsed here to each rank's final error line; every message ends with the same PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True advice and Memory Management documentation link shown above.]
-[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.76 GiB is free. Including non-PyTorch memory, this process has 76.56 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.65 GiB is free. Including non-PyTorch memory, this process has 76.67 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.76 GiB is free. Including non-PyTorch memory, this process has 76.56 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.65 GiB is free. Including non-PyTorch memory, this process has 76.67 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.76 GiB is free. Including non-PyTorch memory, this process has 76.56 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.65 GiB is free. Including non-PyTorch memory, this process has 76.67 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.01 GiB is free. Including non-PyTorch memory, this process has 76.31 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.01 GiB is free. Including non-PyTorch memory, this process has 76.31 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.83 GiB is free. Including non-PyTorch memory, this process has 76.49 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
-[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.01 GiB is free. Including non-PyTorch memory, this process has 76.31 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated.
[Ranks 9, 10, 11 and 12 then begin raising the same traceback; their interleaved stack frames continue in the log below.]
-[default4]:[rank12]: Traceback (most recent call last):
-[default2]:[rank10]: Traceback (most recent call last):
-[default3]:[rank11]: Traceback (most recent call last):
-[default1]:[rank9]: Traceback (most recent call last):
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank12]: output = model(**micro_batch) -[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: output = model(**micro_batch) -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: output = model(**micro_batch) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default3]:[rank11]: output = model(**micro_batch) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: sharded_logits = self.model( -[default1]:[rank9]: sharded_logits = self.model( -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: sharded_logits = self.model( -[default2]:[rank10]: sharded_logits = self.model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank12]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: output = self.pp_block(**new_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: output = self.pp_block(**new_kwargs) -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default2]:[rank10]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/benc[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -h_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call[default1]:[rank9]: return forward_call(*args, **kwargs) -_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: output = self.o_proj(attention_output) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: return row_linear( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. 
GPU -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default1]:[rank9]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: output = self.o_proj(attention_output) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: sharded_logits = self.model( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank9]: return row_linear( -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default4]:[rank12]: output = self.o_proj(attention_output) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank9]: out = F.linear(input, weight, bias) -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: output = self.o_proj(attention_output) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.76 GiB is free. Including non-PyTorch memory, this process has 76.56 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default3]:[rank11]: output = self.o_proj(attention_output) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: return row_linear( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: out = F.linear(input, weight, bias) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.65 GiB is free. Including non-PyTorch memory, this process has 76.67 GiB memory in use. 
Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank10]: return row_linear( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.65 GiB is free. Including non-PyTorch memory, this process has 76.67 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.76 GiB is free. Including non-PyTorch memory, this process has 76.56 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank15]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: output = self.o_proj(attention_output) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.76 GiB is free. Including non-PyTorch memory, this process has 76.56 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.76 GiB is free. Including non-PyTorch memory, this process has 76.56 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: return row_linear( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.65 GiB is free. Including non-PyTorch memory, this process has 76.67 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-1541, in _call_impl
-[default5]:[rank5]: return forward_call(*args, **kwargs)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default5]:[rank5]: output = self.o_proj(attention_output)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default5]:[rank5]: return row_linear(
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default5]:[rank5]: out = F.linear(input, weight, bias)
-[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.53 GiB is free. Including non-PyTorch memory, this process has 75.79 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.53 GiB is free. Including non-PyTorch memory, this process has 75.79 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.53 GiB is free. Including non-PyTorch memory, this process has 75.79 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU
-[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.64 GiB is free. Including non-PyTorch memory, this process has 75.68 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.53 GiB is free. Including non-PyTorch memory, this process has 75.79 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.64 GiB is free. Including non-PyTorch memory, this process has 75.68 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.64 GiB is free. Including non-PyTorch memory, this process has 75.68 GiB memory in use. Of the allocated memory 66.55 GiB is allocated by PyTorch, and 218.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default6]:[rank62]: Traceback (most recent call last):
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank62]: output = model(**micro_batch)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank62]: pipeline_state.run_communication()
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank62]: recv_activation_tensor = recv_activation()
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default6]:[rank62]: dist.recv(
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank62]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank57]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default0]:[rank56]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default0]:[rank56]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default0]:[rank56]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2716049897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:[rank56]: frame #1: + 0x5b3a23e (0x7f274fb6623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f274fb60c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f274fb60f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f274fb61fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f274fb16371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f274fb16371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f274fb16371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f274fb16371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2717323189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:[rank56]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f271732a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:[rank56]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2717349978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:[rank56]: frame #12: + 0x5adc309 (0x7f274fb08309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #13: + 0x5ae6f10 (0x7f274fb12f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #14: + 0x5ae6fa5 (0x7f274fb12fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #15: + 0x5124446 (0x7f274f150446 in
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #16: + 0x1acf4b8 (0x7f939d69e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #16: + 0x1acf4b8 (0x7f274bafb4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #17: + 0x5aee004 (0x7f274fb1a004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #18: + 0x5af36b5 (0x7f274fb1f6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #17: + 0x5aee004 (0x7f93a16bd004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #19: + 0xd2631e (0x7f276270931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #18: + 0x5af36b5 (0x7f93a16c26b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #19: + 0xd2631e (0x7f93b42ac31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #20: + 0x47def4 (0x7f93b3a03ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #21: + 0x1445a6 (0x56073b6d75a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #18: + 0x5af36b5 (0x7fb7046086b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #19: + 0xd2631e (0x7fb7171f231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56073b6d0a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #23: + 0x150866 (0x56073b6e3866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #20: + 0x47def4 (0x7fb716949ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56073b6cc142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #20: + 0x47def4 (0x7f2761e60ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #21: + 0x1445a6 (0x55f0608385a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56073b6d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #21: + 0x1445a6 (0x55b4fec855a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55f060831a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #23: + 0x150866 (0x55f060844866 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #26: PyObject_Call + 0xbc (0x56073b6e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56073b6ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56073b6d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56073b6c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #30: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b4fec7ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #23: + 0x150866 (0x55b4fec91866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b4fec7a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b4fec85a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55f06082d142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #26: PyObject_Call + 0xbc (0x55b4fec91f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55f060838a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #26: PyObject_Call + 0xbc (0x55f060844f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55b4fec782b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56073b6c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #32: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55f06082b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55b4fec85a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56073b6c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #34: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56073b6c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56073b6cff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56073b6e1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #38: + 0x211239 (0x56073b7a4239 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55f060838a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b4fec768fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #30: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b4fec768fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #32: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55f0608298fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #30: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b4fec768fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #34: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b4fec768fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56073b6d0a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b4fec7df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b4fec8fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56073b6cc3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #38: + 0x211239 (0x55b4fed52239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55f0608298fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56073b6d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56073b6c7c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default3]:[rank59]: trainer.train(dataloader) -[default7]:[rank63]: Traceback (most recent call last): -[default1]:[rank57]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56073b6d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -n/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward[default0]:[rank56]: frame #32: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) - -[default6]:[rank62]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b4fec7ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b4fec7a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b4fec85a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: 
new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tens[default6]:[rank62]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b4fec75c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b4fec85a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b4fec768fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: [1] is setting up NC[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -CL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank43]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank43]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e2d159897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank43]: frame #1: + 0x5b3a23e (0x7f8e66c7623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f8e66c70c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x[default0]:[rank56]: frame 
#33: _PyEval_EvalFrameDefault + 0x13ca (0x55f0608298fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -32 (0x7f8e66c70f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f8e66c71fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8e66c26371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8e66c26371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8e66c26371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8e66c26371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f8e2e433189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f8e2e43a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f8e2e459978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #12: <[default1]:[rank57]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56073b6c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -unknown function> + 0x5adc309 (0x7f8e66c18309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #13: + 0x5ae6f10 (0x7f8e66c22f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #34: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #14: + 0x5ae6fa5 (0x7f8e66c22fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: trainer.train(dataloader) -[default3]:[rank43]: frame #15: + 0x5124446 (0x7f8e66260446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #16: + 0x1acf4b8 (0x7f8e62c0b4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #17: + 0x5aee004 (0x7f8e66c2a004 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: frame #18: + 0x5af36b5 (0x7f8e66c2f6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #19: + 0xd2631e (0x7f8e7981931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #20: + 0x47def4 (0x7f8e78f70ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #21: + 0x1445a6 (0x559e481c45a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55f0608298fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55f060830f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: Traceback (most recent call last): -[default3]:[rank43]: frame #22: _PyObject_MakeTpCall + 0x26b (0x559e481bda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #23: + 0x150866 (0x559e481d0866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x559e481b9142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: frame #25: _PyFunction_Vectorcall + 0x6c (0x559e481c4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #26: PyObject_Call + 0xbc (0x559e481d0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x559e481b72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55f060842c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: frame #28: _PyFunction_Vectorcall + 0x6c (0x559e481c4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x559e481b58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #30: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: Traceback (most recent call last): -[default7]:[rank55]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x559e481b58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #32: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x559e481b58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #34: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: trainer.train(dataloader) -[default3]:[rank43]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x559e481b58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x559e481bcf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #37: _PyObject_Call_Prepend + 0x69 (0x559e481cec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #38: + 0x211239 (0x559e48291239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #39: _PyObject_MakeTpCall + 0x26b (0x559e481bda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x559e481b93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #41: _PyFunction_Vectorcall + 0x6c (0x559e481c4a2c in /fsx/ferdinandmom/miniforge3/envs/en[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -v-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x559e481b4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: frame #43: _PyFunction_Vectorcall + 0x6c (0x559e481c4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x559e481b58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #45: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #46: PyObject_Call + 0xbc (0x559e481d0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x559e481b72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, 
state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: frame #48: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #49: PyObject_Call + 0xbc (0x559e481d0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x559e481b72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #51: _PyFunction_Vectorcall + 0x6c (0x559e481c4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x559e481bd007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #53: _PyObject_Call_Prepend + 0x69 (0x559e481cec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #54: + 0x211239 (0x559e48291239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-clu[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -ster/bin/python3.10) -[default3]:[rank43]: frame #55: PyObject_Call + 0x207 (0x559e481d1067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x559e481b72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #57: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x559e481b58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #59: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #60: PyObject_Call + 0xbc (0x559e481d0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x559e481b72b3 in /fsx/ferdinandmom/miniforge3/envs[default1]:[rank57]: frame #45: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: output = model(**micro_batch) -/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #62: + 0x150582 (0x559e481d0582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #63: PyObject_Call + 0xbc (0x559e481d0f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default6]:[rank62]: frame #45: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: output = model(**micro_batch) -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: frame #46: PyObject_Call + 0xbc (0x55b4fec91f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: frame #38: + 0x211239 (0x55f060905239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default0]:[rank56]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55f060831a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b4fec782b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: frame #48: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: frame #49: PyObject_Call + 0xbc (0x55b4fec91f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: output = self.forward(context=context, state=state, 
micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: frame #46: PyObject_Call + 0xbc (0x56073b6e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56073b6ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: sharded_logits = self.model( -[default6]:[rank62]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b4fec782b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55f06082d3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55f060838a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55f060828c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default6]:[rank62]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b4fec85a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55f060838a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: frame #48: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #49: PyObject_Call + 0xbc (0x56073b6e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55f0608298fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #45: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank55]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: Traceback (most recent call last): -[default6]:[rank62]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b4fec7e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b4fec8fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: frame #46: PyObject_Call + 0xbc (0x55f060844f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: frame #54: + 0x211239 (0x55b4fed52239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56073b6ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: trainer.train(dataloader) -[default0]:[rank56]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55f06082b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: frame #55: PyObject_Call + 0x207 (0x55b4fec92067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b4fec782b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: frame #48: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5e131de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank55]: buffers, futures = 
self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56073b6d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56073b6d0007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #1: + 0x5b3a23e (0x7f5e4ccfb23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5e4ccf5c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: sharded_logits = self.model( -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: frame #57: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank53]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5e4ccf5f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5e4ccf6fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: frame #49: PyObject_Call + 0xbc (0x55f060844f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5e4ccab371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5e4ccab371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5e4ccab371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5e4ccab371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: Traceback (most recent call last): -[default5]:[rank53]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f5e144b8189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b4fec768fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: sharded_logits = self.model( -[default5]:[rank53]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f5e144bf610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f5e144de978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return func(*args, **kwargs) -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56073b6e1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", 
line 1932, in recv -[default5]:[rank53]: frame #12: + 0x5adc309 (0x7f5e4cc9d309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank56]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55f06082b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #13: + 0x5ae6f10 (0x7f5e4cca7f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: frame #14: + 0x5ae6fa5 (0x7f5e4cca7fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: frame #54: + 0x211239 (0x56073b7a4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55f060838a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #15: + 0x5124446 (0x7f5e4c2e5446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: frame #16: + 0x1acf4b8 (0x7f5e48c904b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #17: + 0x5aee004 (0x7f5e4ccaf004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: frame #59: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #60: PyObject_Call + 0xbc (0x55b4fec91f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #18: + 0x5af36b5 (0x7f5e4ccb46b5 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #19: + 0xd2631e (0x7f5e5f89e31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default5]:[rank53]: frame #20: + 0x47def4 (0x7f5e5eff5ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default1]:[rank57]: frame #55: PyObject_Call + 0x207 (0x56073b6e4067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default6]:[rank62]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b4fec782b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #21: + 0x1445a6 (0x55bf02a675a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55bf02a60a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55f060831007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd79b23a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank53]: frame #23: + 
0x150866 (0x55bf02a73866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default7]:[rank55]: frame #1: + 0x5b3a23e (0x7fd7d4d5723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55bf02a5c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55f060842c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd7d4d51c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd7d4d51f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd7d4d52fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55bf02a67a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56073b6ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #26: PyObject_Call + 0xbc (0x55bf02a73f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf02a5a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd7d4d07371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55bf02a67a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55bf02a588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd7d4d07371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd7d4d07371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd7d4d07371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: frame #30: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd79c514189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd79c51b610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd79c53a978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: frame #54: + 0x211239 (0x55f060905239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55bf02a588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) 
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: frame #12: + 0x5adc309 (0x7fd7d4cf9309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #13: + 0x5ae6f10 (0x7fd7d4d03f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: frame #32: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55bf02a588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #34: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: dist.recv( -[default6]:[rank62]: frame #62: + 0x150582 (0x55b4fec91582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55bf02a588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55bf02a5ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55bf02a71c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #38: + 0x211239 (0x55bf02b34239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: frame #55: PyObject_Call + 0x207 (0x55f060845067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55bf02a60a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55bf02a5c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication 
-[default3]:[rank59]: return forward_call(*args, **kwargs) -[default7]:[rank55]: frame #14: + 0x5ae6fa5 (0x7fd7d4d03fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #15: + 0x5124446 (0x7fd7d4341446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #16: + 0x1acf4b8 (0x7fd7d0cec4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: frame #17: + 0x5aee004 (0x7fd7d4d0b004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank56]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55f06082b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55bf02a67a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: frame #57: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: frame #18: + 0x5af36b5 (0x7fd7d4d106b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #19: + 0xd2631e (0x7fd7e78fa31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank40]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55bf02a57c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: frame #63: PyObject_Call + 0xbc (0x55b4fec91f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: frame #20: + 0x47def4 (0x7fd7e7051ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank40]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9db92ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: frame #21: + 0x1445a6 (0x561762f175a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #22: _PyObject_MakeTpCall + 0x26b (0x561762f10a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55f0608298fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #23: + 0x150866 (0x561762f23866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55bf02a67a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55bf02a588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: frame #45: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: frame #46: PyObject_Call + 0xbc (0x55bf02a73f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default6]:[rank62]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank53]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf02a5a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #48: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: frame #49: PyObject_Call + 0xbc (0x55bf02a73f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf02a5a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55bf02a67a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55bf02a60007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #1: + 0x5b3a23e (0x7f9df2dc923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #59: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #60: PyObject_Call + 0xbc (0x55f060844f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55bf02a71c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f9df2dc3c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f9df2dc3f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: output = model(**micro_batch) -[default5]:[rank53]: frame #54: + 0x211239 (0x55bf02b34239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f9df2dc4fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: frame #57: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56073b6c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x561762f0c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9df2d79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default7]:[rank55]: frame #25: _PyFunction_Vectorcall + 0x6c (0x561762f17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #26: PyObject_Call + 0xbc (0x561762f23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9df2d79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #59: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x561762f0a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #55: PyObject_Call + 0x207 (0x55bf02a74067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9df2d79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: frame #28: _PyFunction_Vectorcall + 0x6c (0x561762f17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank55]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x561762f088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: frame #30: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf02a5a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x561762f088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank55]: frame #32: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x561762f088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #34: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55f06082b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #60: PyObject_Call + 0xbc (0x56073b6e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56073b6ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x561762f088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #57: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9df2d79371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: output = model(**micro_batch) -[default5]:[rank53]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55bf02a588fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #59: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #60: PyObject_Call + 0xbc (0x55bf02a73f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf02a5a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: trainer.train(dataloader) -[default0]:[rank56]: frame #62: + 0x150582 (0x55f060844582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x561762f0ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f9dba586189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #62: + 0x150582 (0x56073b6e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #37: _PyObject_Call_Prepend + 0x69 (0x561762f21c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f9dba58d610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: frame #62: + 0x150582 (0x55bf02a73582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: frame #63: PyObject_Call + 0xbc (0x55bf02a73f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f9dba5ac978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: frame #38: + 0x211239 (0x561762fe4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #39: _PyObject_MakeTpCall + 0x26b (0x561762f10a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: Traceback (most recent call last): -[default3]:[rank59]: pipeline_state.run_communication() -[default5]:[rank53]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x561762f0c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #41: _PyFunction_Vectorcall + 0x6c (0x561762f17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x561762f07c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #43: _PyFunction_Vectorcall + 0x6c (0x561762f17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x561762f088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #45: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #46: PyObject_Call + 0xbc (0x561762f23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x561762f0a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #48: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #49: PyObject_Call + 0xbc (0x561762f23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x561762f0a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank55]: frame #51: _PyFunction_Vectorcall + 0x6c (0x561762f17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: frame #12: + 0x5adc309 (0x7f9df2d6b309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #13: + 0x5ae6f10 (0x7f9df2d75f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x561762f10007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #53: _PyObject_Call_Prepend + 0x69 (0x561762f21c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #54: + 0x211239 (0x561762fe4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #55: PyObject_Call + 0x207 (0x561762f24067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x561762f0a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #57: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x561762f088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: return func(*args, **kwargs) -[default1]:[rank57]: frame #63: PyObject_Call + 0xbc (0x56073b6e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #59: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #60: PyObject_Call + 0xbc (0x561762f23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x561762f0a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #62: + 0x150582 (0x561762f23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #63: PyObject_Call + 0xbc (0x561762f23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: frame #14: + 0x5ae6fa5 (0x7f9df2d75fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: frame #15: + 0x5124446 (0x7f9df23b3446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: frame #16: + 0x1acf4b8 (0x7f9deed5e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #63: PyObject_Call + 0xbc (0x55f060844f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #17: + 0x5aee004 (0x7f9df2d7d004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank47]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default0]:[rank40]: frame #18: + 0x5af36b5 (0x7f9df2d826b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #19: + 0xd2631e (0x7f9e0596c31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 
(0x7f60c7d4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank56]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: frame #20: + 0x47def4 (0x7f9e050c3ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank40]: frame #21: + 0x1445a6 (0x55bd8eb3e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: frame #1: + 0x5b3a23e (0x7f610186823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: sharded_logits = self.model( -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank57]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank47]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f6101862c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55bd8eb37a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f6101862f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f6101863fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6101818371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6101818371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6101818371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank49]:     trainer.train(dataloader)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank49]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank49]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank49]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank49]:     output = model(**micro_batch)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank49]:     sharded_logits = self.model(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank49]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank49]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank49]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank49]:     pipeline_state.run_communication()
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank49]:     recv_activation_tensor = recv_activation()
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank49]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank49]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank49]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank49]:     dist.recv(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank49]:     return func(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank49]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank49]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank49]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank49]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) (libc10.so)
-[default1]:[rank49]: frame #1: <unknown function> (libtorch_cpu.so)
-[default1]:[rank49]: frame #2: c10d::TCPStore::doWait(...) (libtorch_cpu.so)
-[default1]:[rank49]: frame #3: c10d::TCPStore::doGet(std::string const&) (libtorch_cpu.so)
-[default1]:[rank49]: frame #4: c10d::TCPStore::get(std::string const&) (libtorch_cpu.so)
-[default1]:[rank49]: frames #5-#8: c10d::PrefixStore::get(std::string const&) (libtorch_cpu.so)
-[default1]:[rank49]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) (libtorch_cuda.so)
-[default1]:[rank49]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) (libtorch_cuda.so)
-[default1]:[rank49]: frame #11: c10d::ProcessGroupNCCL::recv(...) (libtorch_cuda.so)
-[default1]:[rank49]: frames #12-#20: internal frames in libtorch_cpu.so and libtorch_python.so
-[default1]:[rank49]: frames #21-#63: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, ...) in bin/python3.10
-[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue.
(Ranks 33, 40, 41, 44, 47, 58, 59, 60, 61 and 63 emit the same traceback, DistBackendError and frame dump, interleaved with one another in the log.)
-[default3]:[rank59]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x558643f4a8fa in
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #13: + 0x5ae6f10 (0x7faabf621f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5a72a95fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x560de9b312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #32: + 0x150582 (0x558643f65582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank44]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank59]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x558643f4a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a72a4a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a72a4a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default[default7]:[rank47]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x558e1d4de2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank61]: frame #14: + 0x5ae6fa5 (0x7faabf621fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/p[default1]:[rank41]: frame #12: + 0x5adc309 (0x7fef6cd31309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #13: + 0x5ae6f10 (0x7fef6cd3bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #21: + 0x1445a6 (0x563e8797e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -arallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: frame #48: + 0x150582 (0x558e1d4f7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a72a4a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank33]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank33][default4]:[rank44]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7a2916f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank41]: frame #14: + 0x5ae6fa5 (0x7fef6cd3bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #15: + 0x5124446 (0x7fef6c379446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5a72a4a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #28: _PyFunction_Vectorcall + 0x6c (0x560de9b3ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -: frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4357e40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank33]: frame #1: + 0x5b3a23e (0x7f439195d23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f4391957c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f4391957f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f4391958fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/si[default7]:[rank47]: frame #49: PyObject_Call + 0xbc (0x558e1d4f7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #16: + 0x1acf4b8 (0x7fef68d244b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f5a3a257189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -te-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f439190d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f439190d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f439190d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f439190d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f435911a189 in /fsx[default1]:[rank41]: frame #17: + 0x5aee004 (0x7fef6cd43004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x558e1d4de2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #15: + 0x5124446 (0x7faabec5f446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #16: + 0x1acf4b8 (0x7faabb60a4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: sharded_logits = self.model( 
-/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4359121610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4359140978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #12: + 0x5adc309 (0x7f43918ff309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #13: + 0x5ae6f10 (0x7f4391909f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #1: + 0x5b3a23e (0x7f7a62c8c23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563e87977a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: frame #14: + 0x5ae6fa5 (0x7f4391909fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f7a62c86c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #51: _PyFunction_Vectorcall + 0x6c (0x558e1d4eba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #34: + 0x150582 (0x558643f65582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #17: + 0x5aee004 (0x7faabf629004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f5a3a25e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f5a3a27d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: frame #15: + 0x5124446 (0x7f4390f47446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #16: + 0x1acf4b8 (0x7f438d8f24b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default1]:[rank33]: frame #17: + 0x5aee004 (0x7f4391911004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #18: + 0x5af36b5 (0x7f43919166b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #19: + 0xd2631e (0x7f43a450031e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank33]: frame #20: + 0x47def4 (0x7f43a3c[default7]:[rank47]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x558e1d4e4007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x558643f4a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x560de9b2f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -57ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f7a62c86f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7a62c87fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #30: + 0x150582 (0x560de9b4a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #21: + 0x1445a6 (0x563bb048a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563bb0483a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #23: + 0x150866 (0x563bb0496866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563bb047f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #53: _PyObject_Call_Prepend + 0x69 (0x558e1d4f5c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #18: + 0x5af36b5 (0x7faabf62e6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563bb048aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #26: PyObject_Call + 0xbc (0x563bb0496f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563bb047d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563bb048aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #29: 
_PyEval_EvalFrameDefault + 0x13ca (0x563bb047b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #12: + 0x5adc309 (0x7f5a72a3c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #13: + 0x5ae6f10 (0x7f5a72a46f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #30: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563bb047b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #32: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563bb047b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x558643f51f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #37: _PyObject_Call_Prepend + 0x69 (0x558643f63c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #38: + 0x211239 (0x558644026239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: frame #34: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563bb047b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563bb0482f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563bb0494c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #38: + 0x211239 (0x563bb0557239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563bb0483a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563bb047f3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default1]:[rank33]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563bb048aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563bb047ac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563bb048aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #44: _PyEval_EvalFrameDefault + 
0x13ca (0x563bb047b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #45: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #46: PyObject_Call + 0xbc (0x563bb0496f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563bb047d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-clu[default1]:[rank41]: frame #18: + 0x5af36b5 (0x7fef6cd486b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #19: + 0xd2631e (0x7fef7f93231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -ster/bin/python3.10) -[default1]:[rank33]: frame #48: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #49: PyObject_Call + 0xbc (0x563bb0496f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563bb047d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563bb048aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563bb0483007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #20: + 0x47def4 (0x7fef7f089ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #54: + 0x211239 (0x558e1d5b8239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #55: PyObject_Call + 0x207 (0x558e1d4f8067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563bb0494c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #54: + 0x211239 (0x563bb0557239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #55: PyObject_Call + 0x207 (0x563bb0497067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563bb047d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #21: + 0x1445a6 (0x56355f6ac5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7a62c3c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x558e1d4de2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #57: + 0x150582 (0x558e1d4f7582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #14: + 0x5ae6fa5 (0x7f5a72a46fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #15: + 0x5124446 (0x7f5a72084446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #16: + 0x1acf4b8 (0x7f5a6ea2f4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: frame #57: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563bb047b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #59: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #60: PyObject_Call + 0xbc (0x563bb0496f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563bb047d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56355f6a5a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7a62c3c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #17: + 0x5aee004 (0x7f5a72a4e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #39: _PyObject_MakeTpCall + 0x26b (0x558643f52a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: frame #62: + 0x150582 (0x563bb0496582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #63: PyObject_Call + 0xbc (0x563bb0496f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank47]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x558e1d4dc8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x560de9b2f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default4]:[rank44]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7a62c3c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #32: + 0x150582 (0x560de9b4a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7a62c3c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #18: + 0x5af36b5 (0x7f5a72a536b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: frame #59: + 0x150582 (0x558e1d4f7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #19: + 0xd2631e (0x7faad221831e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank44]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f7a2a449189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #23: + 0x150866 (0x563e8798a866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563e87973142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: pipeline_state.run_communication() -[default1]:[rank41]: frame #23: + 0x150866 (0x56355f6b8866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56355f6a1142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #20: + 0x47def4 (0x7faad196fef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank63]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x560de9b2f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: return forward_call(*args, 
**kwargs) -[default4]:[rank44]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f7a2a450610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f7a2a46f978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563e8797ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default1]:[rank41]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56355f6aca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x558643f4e3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: frame #60: PyObject_Call + 0xbc (0x558e1d4f7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x558e1d4de2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #34: + 0x150582 (0x560de9b4a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x560de9b2f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: frame #26: PyObject_Call + 0xbc (0x56355f6b8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #12: + 0x5adc309 (0x7f7a62c2e309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #13: + 0x5ae6f10 (0x7f7a62c38f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #26: PyObject_Call + 0xbc (0x563e8798af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56355f69f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #62: + 0x150582 (0x558e1d4f7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #14: + 0x5ae6fa5 (0x7f7a62c38fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #21: + 0x1445a6 (0x559bbf4155a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: frame #36: 
_PyObject_FastCallDictTstate + 0xd0 (0x560de9b36f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563e879712b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563e8797ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: frame #22: _PyObject_MakeTpCall + 0x26b (0x559bbf40ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #19: + 0xd2631e (0x7f5a8563d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default2]:[rank58]: frame #20: + 0x47def4 (0x7f5a84d94ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: frame #41: _PyFunction_Vectorcall + 0x6c (0x558643f59a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x558643f49c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563e8796f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #30: + 0x150582 (0x563e8798a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: frame #63: PyObject_Call + 0xbc (0x558e1d4f7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #15: + 0x5124446 (0x7f7a62276446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56355f6aca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #23: + 0x150866 (0x559bbf421866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: frame #16: + 0x1acf4b8 (0x7f7a5ec214b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #17: + 0x5aee004 (0x7f7a62c40004 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #37: _PyObject_Call_Prepend + 0x69 (0x560de9b48c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank44]: frame #18: + 0x5af36b5 (0x7f7a62c456b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56355f69d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #38: + 0x211239 (0x560de9c0b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank61]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x559bbf40a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: frame #19: + 0xd2631e (0x7f7a7582f31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #20: + 0x47def4 (0x7f7a74f86ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #21: + 0x1445a6 (0x555f7debf5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return func(*args, **kwargs) -[default4]:[rank44]: frame #21: + 0x1445a6 (0x55597eb665a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #30: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56355f69d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563e8796f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55597eb5fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #43: _PyFunction_Vectorcall + 0x6c (0x558643f59a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #39: _PyObject_MakeTpCall + 0x26b (0x560de9b37a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #22: _PyObject_MakeTpCall + 0x26b (0x555f7deb8a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #23: + 0x150866 (0x555f7decb866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x555f7deb4142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: frame #23: + 0x150866 (0x55597eb72866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x560de9b333e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #25: _PyFunction_Vectorcall + 0x6c (0x559bbf415a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55597eb5b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #25: _PyFunction_Vectorcall + 0x6c (0x555f7debfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55597eb66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #41: _PyFunction_Vectorcall + 0x6c (0x560de9b3ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: frame #26: PyObject_Call + 0xbc (0x55597eb72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #32: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x558643f4a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55597eb592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55597eb66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55597eb578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #30: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #32: + 0x150582 (0x563e8798a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #26: PyObject_Call + 0xbc (0x555f7decbf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #26: PyObject_Call + 0xbc (0x559bbf421f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank44]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55597eb578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default1]:[rank41]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56355f69d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x559bbf4082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: frame #34: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563e8796f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: pipeline_state.run_communication() -[default0]:[rank48]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank41]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56355f69d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #45: + 0x150582 (0x558643f65582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default4]:[rank44]: frame #32: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x555f7deb22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #34: + 0x150582 (0x563e8798a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f55b2721897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank44]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55597eb578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #34: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563e8796f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank63]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x560de9b2ec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #43: _PyFunction_Vectorcall + 0x6c (0x560de9b3ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55597eb578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #28: _PyFunction_Vectorcall + 0x6c (0x555f7debfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #1: + 0x5b3a23e (0x7f55ec23e23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55597eb5ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56355f6a4f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56355f6b6c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #28: _PyFunction_Vectorcall + 0x6c (0x559bbf415a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x560de9b2f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f55ec238c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: frame #45: + 0x150582 (0x560de9b4a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default2]:[rank58]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x555f7deb08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -n/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward[default2]:[rank58]: frame #30: + 0x150582 (0x555f7decb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta - -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x559bbf4068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #46: PyObject_Call + 0xbc (0x560de9b4af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f55ec238f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f55ec239fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_[default4]:[rank44]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55597eb70c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #38: + 0x211239 (0x55597ec33239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55597eb5fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x555f7deb08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f55ec1ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f55ec1ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55597eb5b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55597eb66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55597eb56c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55597eb66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #32: + 0x150582 (0x555f7decb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f55ec1ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: return func(*args, **kwargs) -[default4]:[rank44]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55597eb578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #45: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #46: PyObject_Call + 0xbc (0x55597eb72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #30: + 0x150582 (0x559bbf421582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: dist.recv( -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55597eb592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #48: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x559bbf4068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f55ec1ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f55b39fb189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: frame #38: + 0x211239 (0x56355f779239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56355f6a5a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56355f6a13e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #46: PyObject_Call + 0xbc (0x558643f65f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: Traceback (most recent call last): -[default1]:[rank41]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56355f6aca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x558643f4c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return func(*args, **kwargs) -[default0]:[rank48]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f55b3a02610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f55b3a21978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default4]:[rank44]: frame #49: PyObject_Call + 0xbc (0x55597eb72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55597eb592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #32: + 0x150582 (0x559bbf421582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x555f7deb08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #12: + 0x5adc309 (0x7f55ec1e0309 
in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56355f69cc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x560de9b312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56355f6aca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563e87976f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank51]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56355f69d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #34: + 0x150582 (0x555f7decb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #48: + 0x150582 (0x558643f65582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd763143897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank51]: frame #1: + 0x5b3a23e (0x7fd79cc6023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: frame #45: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #49: PyObject_Call + 0xbc (0x558643f65f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #13: + 0x5ae6f10 (0x7f55ec1eaf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank41]: frame #46: PyObject_Call + 0xbc 
(0x56355f6b8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563e87988c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd79cc5ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56355f69f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x559bbf4068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #14: + 0x5ae6fa5 (0x7f55ec1eafa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default4]:[rank44]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55597eb66a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x555f7deb08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd79cc5af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #48: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #49: PyObject_Call + 0xbc (0x56355f6b8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x558643f4c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #34: + 0x150582 (0x559bbf421582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #38: + 0x211239 (0x563e87a4b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #15: + 0x5124446 (0x7f55eb828446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55597eb5f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55597eb70c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #54: + 0x211239 (0x55597ec33239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x555f7deb7f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd79cc5bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd79cc10371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default4]:[rank44]: frame #55: PyObject_Call + 0x207 (0x55597eb73067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55597eb592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #37: _PyObject_Call_Prepend + 0x69 (0x555f7dec9c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd79cc10371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd79cc10371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56355f69f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #48: + 0x150582 (0x560de9b4a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563e87977a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #16: + 0x1acf4b8 (0x7f55e81d34b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: Traceback (most recent call last): -[default1]:[rank41]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56355f6aca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #38: + 0x211239 (0x555f7df8c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #17: + 0x5aee004 (0x7f55ec1f2004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd79cc10371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd76441d189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56355f6a5007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #39: _PyObject_MakeTpCall + 0x26b (0x555f7deb8a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #49: PyObject_Call + 0xbc (0x560de9b4af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd764424610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56355f6b6c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #57: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563e879733e6 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #18: + 0x5af36b5 (0x7f55ec1f76b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55597eb578fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #51: _PyFunction_Vectorcall + 0x6c (0x558643f59a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x559bbf4068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #19: + 0xd2631e (0x7f55fede131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #20: + 0x47def4 (0x7f55fe538ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #21: + 0x1445a6 (0x55de52c175a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank44]: frame #59: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x555f7deb43e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55de52c10a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #23: + 0x150866 (0x55de52c23866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55de52c0c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: frame #60: PyObject_Call + 0xbc (0x55597eb72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55597eb592b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x560de9b312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x559bbf40df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd764443978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: frame #12: + 0x5adc309 (0x7fd79cc02309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55de52c17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: Traceback 
(most recent call last): -[default4]:[rank44]: frame #62: + 0x150582 (0x55597eb72582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #54: + 0x211239 (0x56355f779239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #41: _PyFunction_Vectorcall + 0x6c (0x555f7debfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #26: PyObject_Call + 0xbc (0x55de52c23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55de52c0a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #63: PyObject_Call + 0xbc (0x55597eb72f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563e8797ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x558643f52007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55de52c17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank58]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x555f7deafc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #13: + 0x5ae6f10 (0x7fd79cc0cf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #55: PyObject_Call + 0x207 (0x56355f6b9067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56355f69f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #57: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56355f69d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #59: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #60: PyObject_Call + 0xbc (0x56355f6b8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #37: _PyObject_Call_Prepend + 0x69 (0x559bbf41fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55de52c088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56355f69f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #62: + 0x150582 (0x56355f6b8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #63: PyObject_Call + 0xbc (0x56355f6b8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank61]: frame #38: + 0x211239 (0x559bbf4e2239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #30: + 0x150582 (0x55de52c23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563e8796ec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #14: + 0x5ae6fa5 (0x7fd79cc0cfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #15: + 0x5124446 (0x7fd79c24a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: frame #43: _PyFunction_Vectorcall + 0x6c (0x555f7debfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #16: + 0x1acf4b8 (0x7fd798bf54b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55de52c088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank60]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563e8797ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #17: + 0x5aee004 (0x7fd79cc14004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563e8796f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #32: + 0x150582 (0x55de52c23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #18: + 0x5af36b5 (0x7fd79cc196b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: trainer.train(dataloader) -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: frame #39: _PyObject_MakeTpCall + 0x26b (0x559bbf40ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55de52c088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x555f7deb08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #19: + 0xd2631e (0x7fd7af80331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #34: + 0x150582 
(0x55de52c23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #20: + 0x47def4 (0x7fd7aef5aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #45: + 0x150582 (0x555f7decb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #51: _PyFunction_Vectorcall + 0x6c (0x560de9b3ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55de52c088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #45: + 0x150582 (0x563e8798a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55de52c0ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #21: + 0x1445a6 (0x55bc06dd05a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #46: PyObject_Call + 0xbc (0x563e8798af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x559bbf40a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #53: _PyObject_Call_Prepend + 0x69 (0x558643f63c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #54: + 0x211239 (0x558644026239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55de52c21c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default4]:[rank60]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563e879712b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55bc06dc9a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #23: + 0x150866 (0x55bc06ddc866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55bc06dc5142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: frame #41: _PyFunction_Vectorcall + 0x6c (0x559bbf415a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55bc06dd0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x560de9b37007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: 
frame #26: PyObject_Call + 0xbc (0x55bc06ddcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc06dc32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55bc06dd0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: frame #46: PyObject_Call + 0xbc (0x555f7decbf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #48: + 0x150582 (0x563e8798a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #55: PyObject_Call + 0x207 (0x558643f66067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #38: + 0x211239 (0x55de52ce4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f505450d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x558643f4c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55bc06dc18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #30: + 0x150582 (0x55bc06ddc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x555f7deb22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #48: + 0x150582 (0x555f7decb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55bc06dc18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #32: + 0x150582 (0x55bc06ddc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #53: _PyObject_Call_Prepend + 0x69 (0x560de9b48c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #57: + 0x150582 (0x558643f65582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55bc06dc18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #34: + 0x150582 (0x55bc06ddc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55bc06dc18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #49: PyObject_Call + 0xbc (0x555f7decbf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55de52c10a6b in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #54: + 0x211239 (0x560de9c0b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x559bbf405c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x558643f4a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #49: PyObject_Call + 0xbc (0x563e8798af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55bc06dc8f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55bc06ddac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #55: PyObject_Call + 0x207 (0x560de9b4b067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #59: + 0x150582 (0x558643f65582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #38: + 0x211239 (0x55bc06e9d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55bc06dc9a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55bc06dc53e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #60: PyObject_Call + 0xbc (0x558643f65f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x555f7deb22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55bc06dd0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #51: _PyFunction_Vectorcall + 0x6c (0x555f7debfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55bc06dc0c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55bc06dd0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x558643f4c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55bc06dc18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55de52c0c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x555f7deb8007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #53: _PyObject_Call_Prepend + 0x69 (0x555f7dec9c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #45: + 0x150582 (0x55bc06ddc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #62: + 0x150582 (0x558643f65582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55de52c17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563e879712b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #46: PyObject_Call + 0xbc (0x55bc06ddcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #1: + 0x5b3a23e (0x7f508e02a23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x560de9b312b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #57: + 0x150582 (0x560de9b4a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x560de9b2f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55de52c07c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc06dc32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: frame #54: + 0x211239 (0x555f7df8c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563e8797ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55de52c17a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55de52c088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default3]:[rank35]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f508e024c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #59: + 0x150582 (0x560de9b4a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #45: + 0x150582 (0x55de52c23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #46: PyObject_Call + 0xbc (0x55de52c23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f508e024f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #55: PyObject_Call + 0x207 (0x555f7decc067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563e87977007 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #48: + 0x150582 (0x55bc06ddc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: frame #63: PyObject_Call + 0xbc (0x558643f65f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #49: PyObject_Call + 0xbc (0x55bc06ddcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc06dc32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x555f7deb22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563e87988c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55de52c0a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default5]:[rank61]: frame #43: _PyFunction_Vectorcall + 0x6c (0x559bbf415a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x559bbf4068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #48: + 0x150582 (0x55de52c23582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #49: PyObject_Call + 0xbc (0x55de52c23f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
[... native backtrace frames (continuing up to frame #63) from ranks 35, 48, 51, 58, 60, 61 and 63 and Python traceback fragments from ranks 36, 37 and 39 are interleaved here and throughout the output below; each completed backtrace ends with ". This may indicate a possible application crash on rank 0 or a network set up issue." ...]
-[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: frame #62: + 0x150582 (0x563e8798a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x563e8798af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default3]:[rank35]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f508dfda371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: frame #63: PyObject_Call + 0xbc (0x555f7decbf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.p[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: . This may indicate a possible application crash on rank 0 or a network set up issue. -y", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x559bbf4082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: frame #51: _PyFunction_Vectorcall + 0x6c (0x559bbf415a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank52]: 
pipeline_state.run_communication() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank52]: dist.recv( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: return func(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank52]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank52]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank35]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f50557e7189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f50557ee610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1bb39d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank52]: frame #1: + 0x5b3a23e (0x7f1bed4f423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x559bbf40e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #53: _PyObject_Call_Prepend + 0x69 (0x559bbf41fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #54: + 0x211239 (0x559bbf4e2239 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #55: PyObject_Call + 0x207 (0x559bbf422067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x559bbf4082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f1bed4eec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f1bed4eef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: frame #57: + 0x150582 (0x559bbf421582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x559bbf4068fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #59: + 0x150582 (0x559bbf421582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f1bed4effd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1bed4a4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1bed4a4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: frame #60: PyObject_Call + 0xbc (0x559bbf421f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x559bbf4082b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #62: + 0x150582 (0x559bbf421582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #63: PyObject_Call + 0xbc (0x559bbf421f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1bed4a4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1bed4a4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f1bb4cb1189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: 
File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank52]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f1bb4cb8610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f1bb4cd7978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #12: + 0x5adc309 (0x7f1bed496309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f505580d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #12: + 0x5adc309 (0x7f508dfcc309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #13: + 0x5ae6f10 (0x7f1bed4a0f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #13: + 0x5ae6f10 (0x7f508dfd6f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #14: + 0x5ae6fa5 (0x7f1bed4a0fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #15: + 0x5124446 (0x7f1becade446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #16: + 0x1acf4b8 (0x7f1be94894b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #17: + 0x5aee004 (0x7f1bed4a8004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #18: + 0x5af36b5 (0x7f1bed4ad6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #19: + 0xd2631e (0x7f1c00097[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: frame #20: + 0x47def4 (0x7f1bff7eeef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: frame #21: + 0x1445a6 (0x55e9752925a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55e97528ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #23: + 0x150866 (0x55e97529e866 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55e975287142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55e975292a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #26: PyObject_Call + 0xbc (0x55e97529ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55e9752852b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55e975292a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55e9752838fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #30: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55e9752838fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #32: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55e9752838fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #34: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55e9752838fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55e97528af50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55e97529cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #38: + 0x211239 (0x55e97535f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55e97528ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55e9752873e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55e975292a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55e975282c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55e975292a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55e9752838fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #45: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #46: PyObject_Call + 0xbc (0x55e97529ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55e9752852b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #48: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #49: PyObject_Call + 0xbc (0x55e97529ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55e9752852b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55e975292a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55e97528b007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55e97529cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #54: + 0x211239 (0x55e97535f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: frame #55: PyObject_Call + 0x207 (0x55e97529f067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55e9752852b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #57: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default3]:[rank35]: frame #14: + 0x5ae6fa5 (0x7f508dfd6fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55e9752838fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #59: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #60: PyObject_Call + 0xbc (0x55e97529ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55e9752852b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #15: + 0x5124446 (0x7f508d614446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #62: + 0x150582 (0x55e97529e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #63: PyObject_Call + 0xbc (0x55e97529ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: frame #16: + 0x1acf4b8 (0x7f5089fbf4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-[default3]:[rank35]: frame #17: + 0x5aee004 (0x7f508dfde004 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[defaul[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -t6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank35]: frame #18: + 0x5af36b5 (0x7f508dfe36b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank54]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank54]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f708f302897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10[default5]:[rank37]: pipeline_state.run_communication() -[default7]:[rank39]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: frame #19: + 0xd2631e (0x7f50a0bcd31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -/site-packages/torch/lib/libc10.so) -[default3]:[rank35]: frame #20: + 0x47def4 (0x7f50a0324ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: frame #1: + 0x5b3a23e (0x7f70c8e1f23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f70c8e19c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f70c8e19f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f70c8e1afd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f70c8dcf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f70c8dcf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f70c8dcf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f70c8dcf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f70905dc189 in /fsx/ferdinandmom/miniforge3/envs/env-bench[default3]:[rank35]: frame #21: + 0x1445a6 (0x557cd86215a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] --cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #22: _PyObject_MakeTpCall + 0x26b (0x557cd861aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f70905e3610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank54]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f7090602978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) 
-[default6]:[rank54]: frame #12: + 0x5adc309 (0x7f70c8dc1309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #13: + 0x5ae6f10 (0x7f70c8dcbf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #23: + 0x150866 (0x557cd862d866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #14: + 0x5ae6fa5 (0x7f70c8dcbfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #15: + 0x5124446 (0x7f70c8409446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #16: + 0x1acf4b8 (0x7f70c4db44b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #17: + 0x5aee004 (0x7f70c8dd3004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #18: + 0x5af36b5 (0x7f70c8dd86b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #19: + 0xd2631e (0x7f70db9c2[default5]:[rank37]: recv_activation_tensor = recv_activation() -31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank54]: frame #20: + 0x47def4 (0x7f70db119ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank54]: frame #21: + 0x1445a6 (0x56521c4985a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56521c491a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #23: + 0x150866 (0x56521c4a4866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56521c48d142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56521c498a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #26: PyObject_Call + 0xbc (0x56521c4a4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56521c48b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56521c498a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56521c4898fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #30: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56521c4898fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #32: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #33: _PyEval_EvalFrameDefault + 0x13ca 
(0x56521c4898fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #34: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56521c4898fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56521c490f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56521c4a2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #38: + 0x211239 (0x56521c565239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56521c491a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56521c48d3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56521c498a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56521c488c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x557cd8616142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #25: _PyFunction_Vectorcall + 0x6c (0x557cd8621a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56521c498a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56521c4898fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #45: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #46: PyObject_Call + 0xbc (0x56521c4a4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56521c48b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: frame #48: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #49: PyObject_Call + 0xbc (0x56521c4a4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56521c48b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56521c498a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56521c491007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: return 
self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56521c4a2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #54: + 0x211239 (0x56521c565239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #55: PyObject_Call + 0x207 (0x56521c4a5067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56521c48b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #57: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: frame #26: PyObject_Call + 0xbc (0x557cd862df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56521c4898fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #59: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #60: PyObject_Call + 0xbc (0x56521c4a4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56521c48b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x557cd86142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: frame #62: + 0x150582 (0x56521c4a4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #63: PyObject_Call + 0xbc (0x56521c4a4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default3]:[rank35]: frame #28: _PyFunction_Vectorcall + 0x6c (0x557cd8621a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x557cd86128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/benc[default3]:[rank35]: frame #30: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -h_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in[default3]:[rank35]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x557cd86128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) - recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[d[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -efault2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank50]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2441cde897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank50]: frame #1: + 0x5b3a23e (0x7f247b7fb23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f247b7f5c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f247b7f5f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f247b7f6fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f247b7ab371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f247b7ab371 in /fsx/ferdinandmom/miniforge3/envs/[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f247b7ab371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank50]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f247b7ab371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2442fb8189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f2442fbf610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2442fde978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #12: <[default5]:[rank37]: dist.recv( -unknown function> + 0x5adc309 (0x7f247b79d309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #13: + 0x5ae6f10 (0x7f247b7a7f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default2]:[rank50]: frame #14: + 0x5ae6fa5 (0x7f247b7a7fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #15: + 0x5124446 (0x7f247ade5446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: frame #16: + 0x1acf4b8 (0x7f24777904b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #17: + 0x5aee004 (0x7f247b7af004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #18: + 0x5af36b5 (0x7f247b7b46b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #19: + 0xd2631e (0x7f248e39e31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: return func(*args, **kwargs) -[default2]:[rank50]: frame #20: + 0x47def4 (0x7f248daf5ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #21: + 0x1445a6 (0x563a6b2885a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563a6b281a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: dist.recv( -[default2]:[rank50]: frame #23: + 0x150866 (0x563a6b294866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563a6b27d142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563a6b288a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: frame #26: PyObject_Call + 0xbc (0x563a6b294f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563a6b27b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563a6b288a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563a6b2798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #30: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563a6b2798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #32: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cl[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -uster/bin/python3.10) -[default2]:[rank50]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563a6b2798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #34: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563a6b2798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563a6b280f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563a6b292c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #38: + 0x211239 (0x563a6b355239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563a6b281a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563a6b27d3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563a6b288a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563a6b278c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563a6b288a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563a6b2798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #45: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #46: PyObject_Call + 0xbc (0x563a6b294f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default2]:[rank50]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563a6b27b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #48: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #49: PyObject_Call + 0xbc (0x563a6b294f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563a6b27b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563a6b288a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: pipeline_state.run_communication() -[default3]:[rank35]: frame #32: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x557cd86128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563a6b281007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563a6b292c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #54: + 0x211239 (0x563a6b355239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #55: PyObject_Call + 0x207 (0x563a6b295067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank50]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563a6b27b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #57: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563a6b2798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #34: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #59: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #60: PyObject_Call + 0xbc (0x563a6b294f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563a6b27b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #62: + 0x150582 (0x563a6b294582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x557cd86128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #63: PyObject_Call + 0xbc (0x563a6b294f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default3]:[rank35]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x557cd8619f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #37: _PyObject_Call_Prepend + 0x69 (0x557cd862bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: Traceback (most recent call last): -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: frame #38: + 0x211239 (0x557cd86ee239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #39: _PyObject_MakeTpCall + 0x26b (0x557cd861aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default3]:[rank35]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x557cd86163e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557cd8621a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff2dc6d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank45]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: frame #1: + 0x5b3a23e (0x7ff3161ef23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7ff3161e9c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7ff3161e9f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557cd8611c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557cd8621a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x557cd86128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank37]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: frame #45: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default3]:[rank35]: frame #46: PyObject_Call + 0xbc (0x557cd862df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7ff3161eafd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2177ea2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default3]:[rank35]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x557cd86142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: frame #48: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #49: PyObject_Call + 0xbc (0x557cd862df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x557cd86142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557cd8621a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff31619f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff31619f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff31619f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557cd861a007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557cd862bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff31619f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7ff2dd9ac189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: output = model(**micro_batch) -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: frame #54: + 0x211239 (0x557cd86ee239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #55: PyObject_Call + 0x207 (0x557cd862e067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x557cd86142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta 
-[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: frame #57: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x557cd86128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7ff2dd9b3610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7ff2dd9d2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: frame #59: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: frame #60: PyObject_Call + 0xbc (0x557cd862df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x557cd86142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: frame #62: + 0x150582 (0x557cd862d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #63: PyObject_Call + 0xbc (0x557cd862df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: frame #1: + 0x5b3a23e (0x7f21b19bf23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank45]: sharded_logits = self.model( -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: frame #12: + 0x5adc309 (0x7ff316191309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #13: + 0x5ae6f10 (0x7ff31619bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: output = model(**micro_batch) -[default5]:[rank37]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f21b19b9c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default5]:[rank37]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f21b19b9f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: frame #14: + 0x5ae6fa5 (0x7ff31619bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f21b19bafd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f21b196f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #15: + 0x5124446 (0x7ff3157d9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #16: + 0x1acf4b8 (0x7ff3121844b8 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faea62e5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank39]: frame #1: + 0x5b3a23e (0x7faedfe0223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #17: + 0x5aee004 (0x7ff3161a3004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #18: + 0x5af36b5 (0x7ff3161a86b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #19: + 0xd2631e (0x7ff328d9231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank39]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7faedfdfcc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f21b196f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f21b196f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7faedfdfcf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7faedfdfdfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faedfdb2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faedfdb2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default7]:[rank39]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faedfdb2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faedfdb2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7faea75bf189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank36]: frame #20: + 0x47def4 (0x7ff3284e9ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: frame #21: + 0x1445a6 (0x55aa4b8ec5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55aa4b8e5a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7faea75c6610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f21b196f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7faea75e5978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #12: + 0x5adc309 (0x7faedfda4309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: trainer.train(dataloader) -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: frame #23: + 0x150866 (0x55aa4b8f8866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55aa4b8e1142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f217917c189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f2179183610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f21791a2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #12: + 0x5adc309 (0x7f21b1961309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: frame #13: + 0x5ae6f10 (0x7faedfdaef10 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55aa4b8eca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank36]: frame #26: PyObject_Call + 0xbc (0x55aa4b8f8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: frame #13: + 0x5ae6f10 (0x7f21b196bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default4]:[rank36]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55aa4b8df2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55aa4b8eca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #14: + 0x5ae6fa5 (0x7faedfdaefa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: frame #15: + 0x5124446 (0x7faedf3ec446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55aa4b8dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: output = 
self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: frame #30: + 0x150582 (0x55aa4b8f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55aa4b8dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #16: + 0x1acf4b8 (0x7faedbd974b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: output = model(**micro_batch) -[default2]:[rank42]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank36]: frame #32: + 0x150582 (0x55aa4b8f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #17: + 0x5aee004 (0x7faedfdb6004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #18: + 0x5af36b5 (0x7faedfdbb6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: frame #14: + 0x5ae6fa5 (0x7f21b196bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: frame #15: + 0x5124446 (0x7f21b0fa9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #19: + 0xd2631e (0x7faef29a531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #20: + 0x47def4 (0x7faef20fcef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] 
-[default6]:[rank46]: pipeline_state.run_communication() -[default5]:[rank37]: frame #16: + 0x1acf4b8 (0x7f21ad9544b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fba2c80e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: frame #17: + 0x5aee004 (0x7f21b1973004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: frame #1: + 0x5b3a23e (0x7fba6632b23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55aa4b8dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fba66325c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #34: + 0x150582 (0x55aa4b8f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: dist.recv( -[default0]:[rank32]: return self._call_impl(*args, **kwargs) 
-[default2]:[rank42]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fba66325f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #21: + 0x1445a6 (0x55813073e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #22: _PyObject_MakeTpCall + 0x26b (0x558130737a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank46]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fba66326fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #18: + 0x5af36b5 (0x7f21b19786b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba662db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba662db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba662db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank36]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55aa4b8dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55aa4b8e4f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba662db371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fba2dae8189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: 
sharded_logits = self.model( -[default2]:[rank42]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fba2daef610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55aa4b8f6c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: frame #19: + 0xd2631e (0x7f21c456231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: frame #20: + 0x47def4 (0x7f21c3cb9ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fba2db0e978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: frame #23: + 0x150866 (0x55813074a866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x558130733142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank42]: frame #12: + 0x5adc309 (0x7fba662cd309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #13: + 0x5ae6f10 (0x7fba662d7f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: sharded_logits = self.model( -[default6]:[rank46]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55813073ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #26: PyObject_Call + 0xbc (0x55813074af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #14: + 0x5ae6fa5 (0x7fba662d7fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #15: + 0x5124446 (0x7fba65915446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
[Deleted log output: interleaved NCCL-setup crash traces from ranks 32, 36, 37, 38, 39, 42, 45 and 46; the rank 38 trace is reconstructed below.]
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank38]: return forward_call(*args, **kwargs)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank38]: return self._call_impl(*args, **kwargs)
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank38]: return forward_call(*args, **kwargs)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank38]: pipeline_state.run_communication()
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank38]: recv_activation_tensor = recv_activation()
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default6]:[rank38]: dist.recv(
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank38]: return func(*args, **kwargs)
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank38]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank38]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank38]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa10ce76897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:[rank38]: frame #1: + 0x5b3a23e (0x7fa14699323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fa14698dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fa14698df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fa14698efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa146943371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
[frames #6–#8: c10d::PrefixStore::get(std::string const&) + 0x31, repeated at 0x7fa146943371]
-[default6]:[rank38]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fa10e150189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank38]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fa10e157610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank38]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fa10e176978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
[frames #12–#18: unresolved libtorch_cpu.so frames at offsets 0x5adc309, 0x5ae6f10, 0x5ae6fa5, 0x5124446, 0x1acf4b8, 0x5aee004, 0x5af36b5]
[frames #19–#20: unresolved libtorch_python.so frames at offsets 0xd2631e, 0x47def4]
[frames #21–#63: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, ...) in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10]
-[default6]:[rank38]: . This may indicate a possible application crash on rank 0 or a network set up issue.
[Ranks 32, 36, 37, 39, 42, 45 and 46 emit the same DistBackendError and equivalent stack frames, differing only in addresses; the tail of rank 32's copy continues below.]
-[default0]:[rank32]: frame #61: _PyEval_EvalFrameDefault + 0x2d83
(0x559f705fc2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #62: + 0x150582 (0x559f70615582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #63: PyObject_Call + 0xbc (0x559f70615f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in 
_call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank34]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f660d6fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank34]: frame #1: + 0x5b3a23e (0x7f664721923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f6647213c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f6647213f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f6647214fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default2]:[rank34]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f66471c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f66471c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f66471c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f66471c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f660e9d6189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f660e9dd610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f660e9fc978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #12: + 0x5adc309 (0x7f66471bb309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #13: + 0x5ae6f10 (0x7f66471c5f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #14: + 0x5ae6fa5 (0x7f66471c5fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #15: + 0x5124446 (0x7f6646803446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #16: + 0x1acf4b8 (0x7f66431ae4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #17: + 0x5aee004 (0x7f66471cd004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #18: + 0x5af36b5 (0x7f66471d26b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #19: + 0xd2631e (0x7f6659dbc31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #20: + 0x47def4 (0x7f6659513ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #21: + 0x1445a6 (0x5573e2fc15a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5573e2fbaa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #23: + 0x150866 (0x5573e2fcd866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default2]:[rank34]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5573e2fb6142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5573e2fc1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #26: PyObject_Call + 0xbc (0x5573e2fcdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5573e2fb42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5573e2fc1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5573e2fb28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #30: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5573e2fb28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #32: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5573e2fb28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #34: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5573e2fb28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5573e2fb9f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5573e2fcbc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #38: + 0x211239 (0x5573e308e239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5573e2fbaa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5573e2fb63e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5573e2fc1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5573e2fb1c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5573e2fc1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5573e2fb28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #45: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #46: PyObject_Call + 0xbc (0x5573e2fcdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5573e2fb42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: 
frame #48: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #49: PyObject_Call + 0xbc (0x5573e2fcdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5573e2fb42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5573e2fc1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5573e2fba007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5573e2fcbc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #54: + 0x211239 (0x5573e308e239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #55: PyObject_Call + 0x207 (0x5573e2fce067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5573e2fb42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #57: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5573e2fb28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #59: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #60: PyObject_Call + 0xbc (0x5573e2fcdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5573e2fb42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #62: + 0x150582 (0x5573e2fcd582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #63: PyObject_Call + 0xbc (0x5573e2fcdf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-E0703 09:15:46.169000 139850729969472 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 72633) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 72634) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 72635) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 72636) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 72637) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 72638) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 72639) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 72640) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:15:46 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 72633) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ 
-srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -W0703 09:15:50.044000 140605683656448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_958072_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:50.389000 139791160530688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_740311_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:50.834000 139863784224512 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_201915_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:50.854000 139817071003392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-147.ec2.internal_855891_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:50.874000 139667081152256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_468465_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:50.894000 140021301856000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_943239_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.003000 140542056736512 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-167-177.ec2.internal_838550_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 09:15:51.038000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958145 closing signal SIGTERM -W0703 09:15:51.039000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958146 closing signal SIGTERM -W0703 09:15:51.039000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958147 closing signal SIGTERM -W0703 09:15:51.040000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958148 closing signal SIGTERM -W0703 09:15:51.039000 140026962589504 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 943316 closing signal SIGTERM -W0703 09:15:51.040000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958149 closing signal SIGTERM -W0703 09:15:51.040000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958150 closing signal SIGTERM -W0703 09:15:51.041000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958151 closing signal SIGTERM -W0703 09:15:51.041000 140611344389952 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 958152 closing signal SIGTERM -W0703 09:15:51.049000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468538 closing signal SIGTERM -W0703 09:15:51.049000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468539 closing signal SIGTERM -W0703 09:15:51.049000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468540 closing signal SIGTERM -W0703 09:15:51.050000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468541 closing signal SIGTERM -W0703 09:15:51.052000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468542 closing signal SIGTERM -W0703 09:15:51.052000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468543 closing signal SIGTERM -W0703 09:15:51.052000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468544 closing signal SIGTERM -W0703 09:15:51.054000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201989 closing signal SIGTERM -W0703 09:15:51.054000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201990 closing signal SIGTERM -W0703 09:15:51.054000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201991 closing signal SIGTERM -W0703 09:15:51.054000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201992 closing signal SIGTERM -W0703 09:15:51.053000 139672741885760 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 468545 closing signal SIGTERM -W0703 09:15:51.055000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201993 closing signal SIGTERM -W0703 09:15:51.056000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201994 closing signal SIGTERM -W0703 09:15:51.056000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201995 closing signal SIGTERM -W0703 09:15:51.056000 139869444958016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 201996 closing signal SIGTERM -E0703 09:15:51.075000 140547717470016 
torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 838624) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:15:51.081000 140547717470016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-167-177.ec2.internal_838550_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.110000 140547717470016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-167-177.ec2.internal_838550_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.139000 140547717470016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-167-177.ec2.internal_838550_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 838625) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 838626) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 838627) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 838628) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 838629) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 838630) - error_file: 
- traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 838631) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:15:51 - host : ip-26-0-167-177.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 838624) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:15:51.171000 139822731736896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 855964) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:15:51.170000 139796821264192 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 740384) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:15:51.176000 139822731736896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_855891_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.176000 139796821264192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_740311_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.203000 139796821264192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_740311_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.207000 139822731736896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_855891_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.231000 139796821264192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_740311_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 740385) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 740386) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 740387) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 740388) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 740389) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 740390) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 740391) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-138.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 740384) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0703 09:15:51.239000 139822731736896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_855891_0' has failed to shutdown the rendezvous 'none' due to 
an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 855965) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 855966) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 855967) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 855968) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 855969) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 855970) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 855971) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:15:51 - host : ip-26-0-163-147.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 855964) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:15:51.373000 140026962589504 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 943312) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:15:51.379000 140026962589504 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_943239_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-167-177: task 7: Exited with exit code 1 -W0703 09:15:51.406000 140026962589504 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_943239_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:51.434000 140026962589504 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_943239_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 943313) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 943314) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 943315) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 943317) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 943318) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 943319) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:15:51 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 943312) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-138: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-147: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 1: Exited with exit code 1 -W0703 09:15:54.682000 139869444958016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_201915_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:54.695000 139869444958016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_201915_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = 
self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-166-125: task 6: Exited with exit code 1 -W0703 09:15:55.049000 140605683656448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_958072_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:55.879000 139667081152256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_468465_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:55.983000 140611344389952 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_958072_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:55.994000 140611344389952 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_958072_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-165-24: task 5: Exited with exit code 1 -W0703 09:15:59.298000 139672741885760 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_468465_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:15:59.309000 139672741885760 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_468465_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module> - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-164-207: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-256/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/bench.slurm deleted file mode 100644 index 6c8ca4ad6277bdbf01566ad08fcbea33f7bb9d00..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/config.yaml deleted file mode 100644 index 356952f609aca41b325f1892af98b8cc406053d7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 32 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 32 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out deleted file mode 100644 index 7babad6289130bd759080f0d052aea18541def1d..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/log.out +++ /dev/null @@ -1,4027 +0,0 @@ -======================== -START TIME: Wed Jul 3 04:07:43 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 04:07:46.423000 140515379451712 torch/distributed/run.py:757] -W0703 04:07:46.423000 140515379451712 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.423000 140515379451712 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:07:46.423000 140515379451712 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.427000 140126076323648 torch/distributed/run.py:757] -W0703 04:07:46.427000 140126076323648 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.427000 140126076323648 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:07:46.427000 140126076323648 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.425000 140704491784000 torch/distributed/run.py:757] -W0703 04:07:46.425000 140704491784000 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.425000 140704491784000 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:07:46.425000 140704491784000 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.434000 139909740459840 torch/distributed/run.py:757] -W0703 04:07:46.434000 139909740459840 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.434000 139909740459840 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:07:46.434000 139909740459840 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.477000 140428738991936 torch/distributed/run.py:757] -W0703 04:07:46.477000 140428738991936 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.477000 140428738991936 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 04:07:46.477000 140428738991936 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.538000 140430287660864 torch/distributed/run.py:757] -W0703 04:07:46.538000 140430287660864 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.538000 140430287660864 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:07:46.538000 140430287660864 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.561000 140275647977280 torch/distributed/run.py:757] -W0703 04:07:46.561000 140275647977280 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.561000 140275647977280 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:07:46.561000 140275647977280 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.562000 140712564258624 torch/distributed/run.py:757] -W0703 04:07:46.562000 140712564258624 torch/distributed/run.py:757] ***************************************** -W0703 04:07:46.562000 140712564258624 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:07:46.562000 140712564258624 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 04:08:07 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=2, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=32, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 04:08:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=32, -[default0]:07/03/2024 04:08:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=32, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32')), -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272) -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 04:08:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=24|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=24|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=24|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Parametrizing model parameters using StandardParametrizator -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: No checkpoint path provided. 
-[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=24|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=24|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=24|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=28|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=28|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=28|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=30|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=30|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: No checkpoint path provided. -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=30|ip-26-0-174-36]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=27|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=27|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=27|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=26|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=26|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=28|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=28|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=28|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=26|ip-26-0-174-36]: No checkpoint path provided. 
-[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=22|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=22|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=22|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=20|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=20|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=20|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=18|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=18|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=18|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: No checkpoint path provided. -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: No checkpoint path provided. -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=27|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=27|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=27|ip-26-0-165-24]: No checkpoint path provided. 
-[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=26|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: No checkpoint path provided. 
-[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=26|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=26|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=19|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=19|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=19|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=25|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=25|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=25|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=31|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=31|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=31|ip-26-0-174-36]: No checkpoint path provided. 
-[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=29|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=29|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=29|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=30|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=30|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=30|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=16|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=16|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=16|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=29|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=29|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=29|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: No checkpoint path provided. 
-[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=25|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=25|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=25|ip-26-0-174-36]: No checkpoint path provided. -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=31|ip-26-0-165-24]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=31|ip-26-0-165-24]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=31|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=17|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=17|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=17|ip-26-0-173-246]: No checkpoint path provided. -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=21|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=23|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=16|ip-26-0-173-246]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=16|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=23|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=23|ip-26-0-173-246]: No checkpoint path provided. -[default0]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=16|ip-26-0-173-246]: No checkpoint path provided. -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=21|ip-26-0-173-246]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=1|TP=21|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=20|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=20|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=20|ip-26-0-164-207]: No checkpoint path provided. 
-[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=18|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=17|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=17|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=17|ip-26-0-164-207]: No checkpoint path provided. -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=23|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=23|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=23|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=18|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=18|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=19|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=19|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=19|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=22|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=22|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=22|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=21|ip-26-0-164-207]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=21|ip-26-0-164-207]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 04:08:25 [INFO|DP=0|PP=0|TP=21|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 04:08:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 04:08:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 04:08:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 04:08:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 04:08:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Using `datasets` library -[default0]:07/03/2024 04:08:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 04:08:27 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 04:08:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 04:08:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 04:08:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Start training] datetime: 2024-07-03 04:08:28.604204 | mbs: 32 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 04:08:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 04:08:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB
-[... every rank on both pipeline stages (PP=0 and PP=1, TP=0-31) then logs the same warning: "Repo card metadata block was not found. Setting CardData to empty." ...]
-[default1]:07/03/2024 04:08:43 [WARNING|DP=0|PP=1|TP=9|ip-26-0-173-202]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default1]:07/03/2024 04:08:43 [WARNING|DP=0|PP=1|TP=9|ip-26-0-173-202]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank35]: Traceback (most recent call last):
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank35]: trainer.train(dataloader)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank35]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank35]: grad_accumulator.backward(sum(activations))
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank35]: result = loss.backward()
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank35]: torch.autograd.backward(
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank35]: _engine_run_backward(
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank35]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank35]: return user_fn(self, *args)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 821, in backward
-[default3]:[rank35]: dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 643, in _layer_norm_bwd
-[default3]:[rank35]: _layer_norm_bwd_kernel[grid](
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
-[default3]:[rank35]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
-[default3]:[rank35]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
-[default3]:[rank35]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
-[default3]:[rank35]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
-[default3]:[rank35]: fn()
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
-[default3]:[rank35]: self.fn.run(
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
-[default3]:[rank35]: return self.fn.run(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
-[default3]:[rank35]: return self.fn.run(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
-[default3]:[rank35]: return self.fn.run(*args, **kwargs)
-[default3]:[rank35]: [Previous line repeated 2 more times]
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
-[default3]:[rank35]: self.cache[device][key] = compile(
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 202, in compile
-[default3]:[rank35]: return CompiledKernel(so_path, metadata_group.get(metadata_filename))
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 230, in __init__
-[default3]:[rank35]: self.asm = {
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 231, in <dictcomp>
-[default3]:[rank35]: file.suffix[1:]: file.read_bytes() if file.suffix[1:] == driver.binary_ext else file.read_text()
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/pathlib.py", line 1134, in read_text
-[default3]:[rank35]: with self.open(mode='r', encoding=encoding, errors=errors) as f:
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/pathlib.py", line 1119, in open
-[default3]:[rank35]: return self._accessor.open(self, mode, buffering, encoding, errors,
-[default3]:[rank35]: FileNotFoundError: [Errno 2] No such file or directory: '/admin/home/ferdinand_mom/.triton/cache/3bf15d0349ccda200916bdbc5ec3c269/_layer_norm_bwd_kernel.cubin.tmp.pid_1315311_776646'
-[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600044 milliseconds before timing out.
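The FileNotFoundError above is rank 35 (PP=1, TP=3) dying inside Triton's on-disk kernel cache: while autotuning _layer_norm_bwd_kernel it tries to read a freshly compiled .cubin.tmp.pid_* file under /admin/home/ferdinand_mom/.triton/cache/ that another process compiling the same kernel on that node has apparently already renamed or removed. Once that rank is gone, the ranks sending activations to it can never complete, which is consistent with the watchdog timeouts that follow. A common mitigation, shown here only as a hedged sketch (the directory name and the placement of this snippet are assumptions, not something the original benchmark code does), is to give each local rank its own Triton cache directory before any Triton kernel is compiled:

    import os

    # Sketch (assumption): point each local rank at a private Triton cache
    # directory so concurrent autotune compilations on the same node cannot
    # race on one another's *.cubin.tmp files. LOCAL_RANK is set by torchrun;
    # the /scratch prefix is a placeholder path.
    local_rank = os.environ.get("LOCAL_RANK", "0")
    os.environ.setdefault("TRITON_CACHE_DIR", f"/scratch/triton-cache/rank-{local_rank}")

The only requirement is that TRITON_CACHE_DIR is set before the first Triton kernel is compiled in the process.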
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600047 milliseconds before timing out.
-[... the watchdog on each of the other ranks of the first pipeline stage (ranks 0-25, 27, 29-31) reports the same SEND timeout, WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000), after roughly 600000 milliseconds ...]
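Every rank of the first pipeline stage is stuck in the same SEND (SeqNum=11, a 6-element metadata tensor) and only fails once the default 600000 ms NCCL watchdog timeout expires, so the job burns roughly ten extra minutes before aborting. If faster failure is preferred, the process-group timeout can be passed when the group is created; this is a minimal hedged sketch, not the trainer's actual initialization code:

    from datetime import timedelta

    import torch.distributed as dist

    # Sketch (assumption): create the default process group with a shorter
    # timeout so a dead pipeline peer surfaces as an aborted communicator after
    # two minutes instead of the default ten. Rank and world size are read from
    # the torchrun environment variables.
    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=2))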
-[default0]:[rank24]: Traceback (most recent call last):
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank24]: trainer.train(dataloader)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank24]: grad_accumulator.backward(sum(activations))
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank24]: result = loss.backward()
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank24]: torch.autograd.backward(
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank24]: _engine_run_backward(
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank24]: return user_fn(self, *args)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank24]: pipeline_state.run_communication()
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default0]:[rank24]: send_activation()
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default0]:[rank24]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default0]:[rank24]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default0]:[rank24]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default0]:[rank24]: dist.send(
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank24]: return func(*args, **kwargs)
-[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default0]:[rank24]: group.send([tensor], group_dst_rank, tag).wait()
-[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank14]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank14]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank14]: dist.send( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank14]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank15]: send_activation() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank15]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank15]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank15]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank15]: dist.send( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank15]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank11]: send_activation() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank11]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank11]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank11]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank11]: dist.send( 
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank11]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank12]: send_activation() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank12]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank12]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank12]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank12]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank12]: dist.send( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank12]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank30]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default6]:[rank30]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank30]: send_activation() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank30]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank30]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank30]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank30]: dist.send( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank30]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank27]: result = loss.backward() -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank27]: _engine_run_backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank27]: return user_fn(self, *args) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank27]: send_activation() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank27]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank27]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank27]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank27]: dist.send( 
-[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank27]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank19]: send_activation() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank19]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank19]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank19]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank19]: dist.send( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank19]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank22]: send_activation() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank22]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank22]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank22]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank22]: dist.send( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank22]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default3]:[rank3]: send_activation() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default3]:[rank3]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default3]:[rank3]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default3]:[rank3]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default3]:[rank3]: dist.send( -[default3]:[rank3]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default3]:[rank3]: group.send([tensor], group_dst_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank1]: send_activation() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank1]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors 
-[default1]:[rank1]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank1]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank1]: dist.send( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank1]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default2]:[rank2]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank2]: send_activation() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank2]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank2]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank2]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank2]: dist.send( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank2]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank20]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank20]: send_activation() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank20]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank20]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank20]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank20]: dist.send( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank20]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank28]: send_activation() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank28]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank28]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank28]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank28]: dist.send( 
-[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank28]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank29]: send_activation() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank29]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank29]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default5]:[rank29]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank29]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank29]: dist.send( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank29]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default0]:[rank16]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank16]: send_activation() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank16]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank16]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank16]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank16]: dist.send( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank16]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default0]:[rank0]: send_activation() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default0]:[rank0]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default0]:[rank0]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default0]:[rank0]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default0]:[rank0]: dist.send( -[default0]:[rank0]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default0]:[rank0]: group.send([tensor], group_dst_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default5]:[rank5]: send_activation() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default5]:[rank5]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors 
-[default5]:[rank5]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default5]:[rank5]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default5]:[rank5]: dist.send( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default5]:[rank5]: group.send([tensor], group_dst_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank6]: return user_fn(self, *args) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank6]: pipeline_state.run_communication() -[default6]:[rank6]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default6]:[rank6]: send_activation() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default6]:[rank6]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default6]:[rank6]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default6]:[rank6]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default6]:[rank6]: dist.send( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default6]:[rank6]: group.send([tensor], group_dst_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return 
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank31]: send_activation() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank31]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank31]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank31]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank31]: dist.send( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank31]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
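Every traceback above dies in the same place: nanotron's pipeline-parallel p2p._send_meta issues a blocking dist.send of a small metadata tensor, and that call fails once the NCCL communicator has been aborted; the watchdog output that follows shows the underlying SEND (SeqNum=11, NumelIn=6) that ran for the full 10-minute window before the watchdog gave up. For reference only, here is a minimal, self-contained sketch of that kind of blocking send/recv pairing. It is not nanotron's implementation; the 6-element metadata tensor, the rank assignments and the script name are illustrative assumptions. If the receiving rank never posts the matching recv, the sender blocks until the watchdog tears the process group down.

import torch
import torch.distributed as dist

def main() -> None:
    # torchrun supplies MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE in the environment.
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # A small "metadata" tensor, loosely mirroring the NumelIn=6 SEND in the log.
    meta = torch.zeros(6, dtype=torch.int64, device="cuda")

    if rank == 0:
        meta[:] = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.int64, device="cuda")
        dist.send(meta, dst=1)   # blocks until rank 1 posts the matching recv
    elif rank == 1:
        dist.recv(meta, src=0)   # if this is never posted, rank 0's send hangs until the NCCL watchdog fires
        print(f"rank 1 received metadata: {meta.tolist()}")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()

A two-GPU launch such as torchrun --nproc_per_node 2 p2p_meta_sketch.py (hypothetical filename) exercises the pairing; removing the recv branch should reproduce the hang-until-watchdog behaviour recorded here.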
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f61dfad8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f61e0db1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f61e0db6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f61e0db7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f622c850e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f6231897609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f6231662353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f61dfad8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f61e0db1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f61e0db6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f61e0db7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f622c850e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f6231897609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f6231662353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f61dfad8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f61e0a3b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f622c850e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f6231897609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f6231662353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default7]:[rank7]: send_activation() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default7]:[rank7]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default7]:[rank7]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default7]:[rank7]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default7]:[rank7]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default7]:[rank7]: dist.send( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default7]:[rank7]: group.send([tensor], group_dst_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default1]:[rank25]: send_activation() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default1]:[rank25]: 
self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default1]:[rank25]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default1]:[rank25]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default1]:[rank25]: dist.send( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default1]:[rank25]: group.send([tensor], group_dst_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default4]:[rank4]: send_activation() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default4]:[rank4]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default4]:[rank4]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default4]:[rank4]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default4]:[rank4]: dist.send( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default4]:[rank4]: group.send([tensor], group_dst_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 30] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. 
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc8ccc19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc8cdef2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc8cdef7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc8cdef8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fc919991e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fc91e9d8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fc91e7a3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc8ccc19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc8cdef2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc8cdef7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc8cdef8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fc919991e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fc91e9d8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fc91e7a3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc8ccc19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fc8cdb7c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fc919991e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fc91e9d8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fc91e7a3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fde797ca897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fde7aaa3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fde7aaa8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fde7aaa9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fdec6542e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fdecb589609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fdecb354353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fde797ca897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fde7aaa3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fde7aaa8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fde7aaa9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fdec6542e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fdecb589609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fdecb354353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fde797ca897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fde7a72d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fdec6542e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fdecb589609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fdecb354353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6179314897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f617a5edc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f617a5f2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f617a5f3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f61c608ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f61cb0d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f61cae9e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6179314897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f617a5edc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f617a5f2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f617a5f3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f61c608ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f61cb0d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f61cae9e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6179314897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f617a277119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f61c608ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f61cb0d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f61cae9e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication -[default2]:[rank26]: send_activation() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__ -[default2]:[rank26]: self.p2p.send_tensors([self.activation], to_rank=self.to_rank) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors -[default2]:[rank26]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors -[default2]:[rank26]: self._send_meta(tensor, to_rank=to_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta -[default2]:[rank26]: dist.send( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send -[default2]:[rank26]: group.send([tensor], group_dst_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
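Every Python traceback in this log ends in the same nanotron p2p path (send_tensors -> isend_tensors -> _send_meta -> dist.send), and the watchdog reports a SEND of only six elements, so the sender is presumably blocked on a small metadata tensor whose matching receive was never posted on the destination rank. Below is a minimal, self-contained sketch of such a blocking metadata exchange, assuming a two-rank torchrun launch; the 6-element layout is hypothetical and does not reproduce nanotron's actual protocol:

import torch
import torch.distributed as dist

# Sketch of a blocking point-to-point metadata exchange between two ranks.
dist.init_process_group(backend="nccl")
rank = dist.get_rank()
device = torch.device("cuda", rank % torch.cuda.device_count())

if rank == 0:
    # Hypothetical 6-element metadata (e.g. ndim plus shape entries); the real
    # layout lives in nanotron's p2p.py and is not reproduced here.
    meta = torch.tensor([2, 1024, 4096, 0, 0, 0], dtype=torch.int64, device=device)
    dist.send(meta, dst=1)   # blocks until rank 1 posts the matching recv
else:
    meta = torch.empty(6, dtype=torch.int64, device=device)
    dist.recv(meta, src=0)   # omitting this recv is exactly the situation the
                             # watchdog reports: the peer's send never completes

dist.destroy_process_group()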
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0829b12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f082adebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f082adf0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f082adf1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f087688ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f087b8d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f087b69c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0829b12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f082adebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f082adf0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f082adf1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f087688ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f087b8d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f087b69c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0829b12897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f082aa75119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f087688ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f087b8d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f087b69c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9db2641897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7251bbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7252e97c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7252e9ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7252e9ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f729e936e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f72a397d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f72a3748353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9db391ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9db391fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9db3920dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9dff3b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9e04400609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7251bbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #6: clone + 0x43 (0x7f9e041cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7252e97c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7252e9ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7252e9ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f729e936e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #5: + 0x8609 (0x7f72a397d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f72a3748353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7251bbe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9db2641897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f7252b21119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9db391ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9db391fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9db3920dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9dff3b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9e04400609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9e041cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:frame #2: + 0xd3e95 (0x7f729e936e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f72a397d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9db2641897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f9db35a4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f9dff3b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f9e04400609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f9e041cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:frame #4: clone + 0x43 (0x7f72a3748353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbaeb536897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbaec80fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbaec814a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbaec815dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fbb382aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fbb3d2f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fbb3d0c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
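For completeness, the isend_tensors frame in the same tracebacks returns futures rather than blocking, and the non-blocking counterpart of the exchange looks roughly like the sketch below. It uses only the public torch.distributed API with illustrative sizes; nanotron's own wrapper is not reproduced here.

import torch
import torch.distributed as dist

# Sketch of the non-blocking variant: isend/irecv return Work handles that are
# awaited later, so several transfers can be in flight at once.
dist.init_process_group(backend="nccl")
rank = dist.get_rank()
device = torch.device("cuda", rank % torch.cuda.device_count())

payload = torch.randn(1024, device=device)

if rank == 0:
    work = dist.isend(payload, dst=1)   # returns immediately with a Work handle
else:
    work = dist.irecv(payload, src=0)

work.wait()   # a peer that never posts its side leaves this pending until the
              # NCCL watchdog aborts the communicator, as in the log above
dist.destroy_process_group()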
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbaeb536897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbaec80fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbaec814a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbaec815dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fbb382aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fbb3d2f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fbb3d0c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbaeb536897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fbaec499119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fbb382aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fbb3d2f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fbb3d0c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35db93e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f35dcc17c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f35dcc1ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f35dcc1ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f36286b6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f362d6fd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f362d4c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35db93e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f35dcc17c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f35dcc1ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f35dcc1ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f36286b6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f362d6fd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f362d4c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f35db93e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f35dc8a1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f36286b6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f362d6fd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f362d4c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f19b57d1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f19b6aaac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f19b6aafa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f19b6ab0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1a02549e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f1a07590609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1a0735b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f19b57d1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f19b6aaac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f19b6aafa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f19b6ab0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f1a02549e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f1a07590609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f1a0735b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f19b57d1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f19b6734119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f1a02549e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f1a07590609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f1a0735b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f781d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f794aec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f794b3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f794b4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f2fc4f4de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f2fc9f94609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f2fc9d5f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): [same frames as the checkTimeout trace above]
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f781d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: + 0xe32119 (0x7f2f79138119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: + 0xd3e95 (0x7f2fc4f4de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: + 0x8609 (0x7f2fc9f94609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7f2fc9d5f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default7]:[rank23]: Traceback (most recent call last):
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank23]:     trainer.train(dataloader)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank23]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank23]:     outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank23]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank23]:     grad_accumulator.backward(sum(activations))
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank23]:     result = loss.backward()
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank23]:     torch.autograd.backward(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank23]:     _engine_run_backward(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank23]:     return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank23]:     return user_fn(self, *args)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank23]:     pipeline_state.run_communication()
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 142, in run_communication
-[default7]:[rank23]:     send_activation()
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 22, in __call__
-[default7]:[rank23]:     self.p2p.send_tensors([self.activation], to_rank=self.to_rank)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
-[default7]:[rank23]:     futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
-[default7]:[rank23]:     self._send_meta(tensor, to_rank=to_rank, tag=tag)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
-[default7]:[rank23]:     dist.send(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank23]:     return func(*args, **kwargs)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
-[default7]:[rank23]:     group.send([tensor], group_dst_rank, tag).wait()
-[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank17]: [Traceback identical to rank 23 above, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default5]:[rank21]: [Traceback identical to rank 23 above, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default2]:[rank18]: [Traceback identical to rank 23 above, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default6]: [checkTimeout and ncclCommWatchdog stack traces identical in form to rank 0's above]
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600052 milliseconds before timing out.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600057 milliseconds before timing out.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600060 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 19] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600019 milliseconds before timing out.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 23] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600061 milliseconds before timing out.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600025 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600058 milliseconds before timing out.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600043 milliseconds before timing out.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600080 milliseconds before timing out.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 27] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600071 milliseconds before timing out.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 29] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600075 milliseconds before timing out.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600015 milliseconds before timing out.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600058 milliseconds before timing out.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600010 milliseconds before timing out.
-[default2]:[rank10]: [Traceback identical to rank 23 above, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default5]:[rank13]: [Traceback identical to rank 23 above, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default7]:[rank15]: [Watchdog timeout sequence and stack traces identical to rank 14 above; its SEND (SeqNum=11) ran for 600042 milliseconds before timing out.]
-[default0]:[rank8]: [Traceback identical to rank 23 above, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default3]:[rank11]: [Watchdog timeout sequence and stack traces identical to rank 14 above; its SEND (SeqNum=11) ran for 600057 milliseconds before timing out.]
-[default1]:[rank9]: [Traceback identical to rank 23 above, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
-[default4]:[rank12]: [Watchdog timeout sequence and stack traces identical to rank 14 above; its SEND (SeqNum=11) ran for 600042 milliseconds before timing out.]
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600079 milliseconds before timing out.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600077 milliseconds before timing out.
-[default3]:[rank3]: [Watchdog timeout sequence and stack traces identical to rank 14 above; its SEND (SeqNum=11) ran for 600044 milliseconds before timing out.]
-[default1]:[rank1]: [Watchdog timeout sequence and stack traces identical to rank 14 above; its SEND (SeqNum=11) ran for 600045 milliseconds before timing out.]
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600042 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05b7d0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f05b8fe8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f05b8feda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f05b8feedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f0604a87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f0609ace609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f0609899353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05b7d0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f05b8fe8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f05b8feda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f05b8feedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f0604a87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f0609ace609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f0609899353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05b7d0f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f05b8c72119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f0604a87e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f0609ace609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f0609899353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7c2391897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7c366ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7c366fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7c3670dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff80f109e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff814150609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff813f1b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7c2391897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff7c366ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff7c366fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff7c3670dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff80f109e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff814150609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff813f1b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff7c2391897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7ff7c32f4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7ff80f109e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7ff814150609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7ff813f1b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5399f0e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f539b1e7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f539b1eca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f539b1eddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f53e6c86e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f53ebccd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f53eba98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5399f0e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f539b1e7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f539b1eca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f539b1eddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f53e6c86e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f53ebccd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f53eba98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5399f0e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f539ae71119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f53e6c86e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f53ebccd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f53eba98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5282bac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5283e85c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5283e8aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5283e8bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f52cf924e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f52d496b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f52d4736353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5282bac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5283e85c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5283e8aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5283e8bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f52cf924e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f52d496b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f52d4736353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5282bac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f5283b0f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f52cf924e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f52d496b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f52d4736353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f74b59897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8f75e32c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8f75e37a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8f75e38dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f8fc18d1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5203794897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #5: + 0x8609 (0x7f8fc6918609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f8fc66e3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5204a6dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5204a72a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f74b59897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8f75e32c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8f75e37a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8f75e38dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f8fc18d1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5204a73dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f525050ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5255553609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #5: + 0x8609 (0x7f8fc6918609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f525531e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #6: clone + 0x43 (0x7f8fc66e3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f74b59897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #1: + 0xe32119 (0x7f8f75abc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f8fc18d1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f8fc6918609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5203794897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #4: clone + 0x43 (0x7f8fc66e3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5204a6dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5204a72a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5204a73dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f525050ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5255553609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f525531e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5203794897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f52046f7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f525050ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f5255553609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f525531e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fad66438897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fad67711c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fad67716a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fad67717dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fadb31b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fadb81f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fadb7fc2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fad66438897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fad67711c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fad67716a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fad67717dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fadb31b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fadb81f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fadb7fc2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fad66438897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fad6739b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fadb31b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fadb81f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fadb7fc2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdae4baa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdae5e83c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdae5e88a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdae5e89dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fdb31922e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fdb36969609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fdb36734353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdae4baa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdae5e83c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdae5e88a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdae5e89dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fdb31922e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fdb36969609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fdb36734353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdae4baa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fdae5b0d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fdb31922e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fdb36969609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fdb36734353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3b99c2d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3b9af06c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3b9af0ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3b9af0cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3be69a5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3beb9ec609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3beb7b7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3b99c2d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3b9af06c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3b9af0ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3b9af0cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3be69a5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3beb9ec609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3beb7b7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3b99c2d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3b9ab90119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f3be69a5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f3beb9ec609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f3beb7b7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd01ea53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd01fd2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd01fd31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd01fd32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fd06b7cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fd070812609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fd0705dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd01ea53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd01fd2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd01fd31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd01fd32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fd06b7cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fd070812609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fd0705dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd01ea53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fd01f9b6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fd06b7cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fd070812609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fd0705dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f60a1b63897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f60a2e3cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60a2e41a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60a2e42dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f60ee8dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f60f3922609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f60f36ed353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f60a1b63897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f60a2e3cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60a2e41a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60a2e42dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: <unknown function> + 0xd3e95 (0x7f60ee8dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: <unknown function> + 0x8609 (0x7f60f3922609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f60f36ed353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f60a1b63897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: <unknown function> + 0xe32119 (0x7f60a2ac6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: <unknown function> + 0xd3e95 (0x7f60ee8dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: <unknown function> + 0x8609 (0x7f60f3922609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7f60f36ed353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600044 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600038 milliseconds before timing out.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600056 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 11, last enqueued NCCL work: 12, last completed NCCL work: 10.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=11, OpType=SEND, NumelIn=6, NumelOut=6, Timeout(ms)=600000) ran for 600046 milliseconds before timing out.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 26] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 26] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 26] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 26] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600043 milliseconds before timing out.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 30] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 30] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 30] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 30] Process group watchdog thread terminated with exception: [Rank 30] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 31] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 31] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 31] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 31] Process group watchdog thread terminated with exception: [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600080 milliseconds before timing out.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 27] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 27] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 27] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 27] Process group watchdog thread terminated with exception: [Rank 27] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600071 milliseconds before timing out.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 24] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 24] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 24] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 24] Process group watchdog thread terminated with exception: [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600024 milliseconds before timing out.
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 29] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 29] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 29] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 29] Process group watchdog thread terminated with exception: [Rank 29] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600075 milliseconds before timing out.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 25] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 25] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 25] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 25] Process group watchdog thread terminated with exception: [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 28] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 28] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 28] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 28] Process group watchdog thread terminated with exception: [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600058 milliseconds before timing out.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 16] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 16] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 16] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 16] Process group watchdog thread terminated with exception: [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 21] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 21] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 21] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 21] Process group watchdog thread terminated with exception: [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600021 milliseconds before timing out.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 22] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 22] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 22] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 22] Process group watchdog thread terminated with exception: [Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600017 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b83eab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f7b85184c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7b85189a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7b8518adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: <unknown function> + 0xd3e95 (0x7f7bd0c23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: <unknown function> + 0x8609 (0x7f7bd5c6a609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7f7bd5a35353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b83eab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: <unknown function> + 0xe32119 (0x7f7b84e0e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: <unknown function> + 0xd3e95 (0x7f7bd0c23e95 in
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f7bd5c6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f7bd5a35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 23] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 23] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 23] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 23] Process group watchdog thread terminated with exception: [Rank 23] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5314eba897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5316193c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5316198a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5316199dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5361c32e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5366c79609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5366a44353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 23] Process group watchdog thread terminated with exception: [Rank 23] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5314eba897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5316193c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5316198a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5316199dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5361c32e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5366c79609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5366a44353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5314eba897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f5315e1d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f5361c32e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f5366c79609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f5366a44353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 17] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 17] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 17] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 17] Process group watchdog thread terminated with exception: [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff152586897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff15385fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff153864a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff153865dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7ff19f2fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7ff1a4345609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7ff1a4110353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 17] Process group watchdog thread terminated with exception: [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff152586897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff15385fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff153864a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff153865dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7ff19f2fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7ff1a4345609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7ff1a4110353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff152586897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7ff1534e9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7ff19f2fee95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7ff1a4345609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7ff1a4110353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 18] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 18] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 18] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 18] Process group watchdog thread terminated with exception: [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f190e6f6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f190f9cfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f190f9d4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f190f9d5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f195b46ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f19604b5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1960280353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 18] Process group watchdog thread terminated with exception: [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f190e6f6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f190f9cfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f190f9d4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f190f9d5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f195b46ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f19604b5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1960280353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f190e6f6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f190f659119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f195b46ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f19604b5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f1960280353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 20] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 20] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 20] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 20] Process group watchdog thread terminated with exception: [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6d7a044897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6d7b31dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6d7b322a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6d7b323dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f6dc6dbce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f6dcbe03609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6dcbbce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 2 Rank 20] Process group watchdog thread terminated with exception: [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6d7a044897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6d7b31dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6d7b322a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6d7b323dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f6dc6dbce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f6dcbe03609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6dcbbce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6d7a044897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f6d7afa7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f6dc6dbce95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f6dcbe03609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f6dcbbce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 9] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 9] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 9] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 9] Process group watchdog thread terminated with exception: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f1cccc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f1dfa5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f1dfaaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f1dfabdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5f69a44e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5f6ea8b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f5f6e856353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 9] Process group watchdog thread terminated with exception: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f1cccc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f1dfa5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f1dfaaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f1dfabdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5f69a44e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5f6ea8b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f5f6e856353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f1cccc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f5f1dc2f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f5f69a44e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f5f6ea8b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f5f6e856353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 12] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 12] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 12] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 11] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 11] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 11] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4cf76a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4cf897cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4cf8981a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4cf8982dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f4d4441be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 12] Process group watchdog thread terminated with exception: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fec292ad897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 15] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default3]:frame #5: + 0x8609 (0x7f4d49462609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fec2a586c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #6: clone + 0x43 (0x7f4d4922d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 15] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fec2a58ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fec2a58cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 15] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 15] Process group watchdog thread terminated with exception: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. 
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]: what(): [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #4: + 0xd3e95 (0x7fec76025e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bd32ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #5: + 0x8609 (0x7fec7b06c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4cf76a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4bd45c7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4bd45cca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4cf897cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4bd45cddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4cf8981a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4c20066e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4c250ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fec7ae37353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4cf8982dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]: -[default3]:frame #4: + 0xd3e95 (0x7f4d4441be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f4d49462609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4c24e78353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]:frame #6: clone + 0x43 (0x7f4d4922d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 2 Rank 12] Process group watchdog thread terminated with exception: [Rank 12] Watchdog caught collective 
operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fec292ad897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4cf76a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]: what(): [PG 2 Rank 15] Process group watchdog thread terminated with exception: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fec2a586c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #1: + 0xe32119 (0x7f4cf8606119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fec2a58ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bd32ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4bd45c7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fec2a58cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fec76025e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4bd45cca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #5: + 0x8609 (0x7fec7b06c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #2: + 0xd3e95 (0x7f4d4441be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4bd45cddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #6: clone + 0x43 (0x7fec7ae37353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #3: + 0x8609 (0x7f4d49462609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: + 0xd3e95 (0x7f4c20066e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]: 
-[default7]:frame #5: + 0x8609 (0x7f4c250ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f4d4922d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:frame #6: clone + 0x43 (0x7f4c24e78353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fec292ad897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fec2a210119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fec76025e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fec7b06c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bd32ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4bd4251119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: clone + 0x43 (0x7fec7ae37353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 8] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default7]:frame #2: + 0xd3e95 (0x7f4c20066e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 8] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:frame #3: + 0x8609 (0x7f4c250ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 8] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 8] Process group watchdog thread terminated with exception: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff1e313e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff1e4417c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff1e441ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff1e441ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff22feb6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff234efd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4c24e78353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #6: clone + 0x43 (0x7ff234cc8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: -[default0]: what(): [PG 2 Rank 8] Process group watchdog thread terminated with exception: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff1e313e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff1e4417c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff1e441ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff1e441ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff22feb6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff234efd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff234cc8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff1e313e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7ff1e40a1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame 
#2: + 0xd3e95 (0x7ff22feb6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7ff234efd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7ff234cc8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 13] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 13] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 13] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b3aa67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b3bd40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b3bd45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b3bd46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1b877dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1b8c826609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1b8c5f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b3aa67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b3bd40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b3bd45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b3bd46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1b877dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1b8c826609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1b8c5f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b3aa67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f1b3b9ca119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f1b877dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f1b8c826609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f1b8c5f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 14] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 14] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 14] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f27e9c4a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f27eaf23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f27eaf28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f27eaf29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f28369c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f283ba09609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f283b7d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f27e9c4a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f27eaf23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f27eaf28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f27eaf29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f28369c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f283ba09609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f283b7d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f27e9c4a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f27eabad119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f28369c2e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f283ba09609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f283b7d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 10] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 10] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 10] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 10] Process group watchdog thread terminated with exception: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bb8b25897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4bb9dfec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4bb9e03a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4bb9e04dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4c0589de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4c0a8e4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4c0a6af353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 10] Process group watchdog thread terminated with exception: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bb8b25897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4bb9dfec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4bb9e03a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4bb9e04dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4c0589de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4c0a8e4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4c0a6af353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bb8b25897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f4bb9a88119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f4c0589de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f4c0a8e4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f4c0a6af353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 19] Timeout at NCCL work: 69, last enqueued NCCL work: 69, last completed NCCL work: 68. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 19] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 19] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 19] Process group watchdog thread terminated with exception: [Rank 19] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb863ccc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb864fa5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb864faaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb864fabdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fb8b0a44e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fb8b5a8b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fb8b5856353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 2 Rank 19] Process group watchdog thread terminated with exception: [Rank 19] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=69, OpType=_ALLGATHER_BASE, NumelIn=8388608, NumelOut=268435456, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb863ccc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb864fa5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb864faaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb864fabdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fb8b0a44e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fb8b5a8b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fb8b5856353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb863ccc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fb864c2f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fb8b0a44e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fb8b5a8b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fb8b5856353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -W0703 04:19:34.443000 140430287660864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1670730 closing signal SIGTERM -W0703 04:19:34.444000 140430287660864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1670731 closing signal SIGTERM -W0703 04:19:34.444000 140430287660864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1670732 closing signal SIGTERM -W0703 04:19:34.444000 140430287660864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1670733 closing signal SIGTERM -E0703 04:19:35.464000 140430287660864 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1670726) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:19:34 - host : ip-26-0-162-233.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1670727) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1670727 -[2]: - time : 2024-07-03_04:19:34 - host : ip-26-0-162-233.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1670728) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1670728 -[3]: - time : 2024-07-03_04:19:34 - host : ip-26-0-162-233.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1670729) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1670729 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:19:34 - host : ip-26-0-162-233.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1670726) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1670726 -============================================================ -srun: error: ip-26-0-162-233: task 0: Exited with exit code 1 -W0703 04:19:38.196000 140120415590144 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-174-36.ec2.internal_846724_0' has failed 
to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:38.269000 140269987243776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_417204_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:38.991000 140423078258432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-202.ec2.internal_1315236_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.103000 139904079726336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_906642_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.121000 140706903525120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_333077_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.149000 140698831050496 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-147.ec2.internal_804157_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.183000 140509718718208 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_32970_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.405000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315306 closing signal SIGTERM -W0703 04:19:39.406000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315307 closing signal SIGTERM -W0703 04:19:39.406000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315308 closing signal SIGTERM -W0703 04:19:39.406000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315309 closing signal SIGTERM -W0703 04:19:39.406000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315310 closing signal SIGTERM -W0703 04:19:39.406000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315311 closing signal SIGTERM -W0703 04:19:39.406000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315312 closing signal SIGTERM -W0703 04:19:39.406000 140428738991936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1315313 closing signal SIGTERM -W0703 04:19:39.403000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33040 closing signal SIGTERM -W0703 04:19:39.404000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33041 closing signal SIGTERM -W0703 04:19:39.404000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33042 closing signal SIGTERM -W0703 04:19:39.404000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33043 closing signal SIGTERM -W0703 04:19:39.404000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33044 closing signal SIGTERM -W0703 
04:19:39.407000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33045 closing signal SIGTERM -W0703 04:19:39.409000 139909740459840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906716 closing signal SIGTERM -W0703 04:19:39.409000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33046 closing signal SIGTERM -W0703 04:19:39.409000 140515379451712 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 33047 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333147 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333148 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333149 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333150 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333151 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333152 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333153 closing signal SIGTERM -W0703 04:19:39.420000 140712564258624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 333154 closing signal SIGTERM -W0703 04:19:39.440000 140275647977280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 417276 closing signal SIGTERM -W0703 04:19:39.440000 140275647977280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 417277 closing signal SIGTERM -W0703 04:19:39.440000 140275647977280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 417280 closing signal SIGTERM -W0703 04:19:39.440000 140275647977280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 417282 closing signal SIGTERM -W0703 04:19:39.441000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 804228 closing signal SIGTERM -W0703 04:19:39.441000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 804229 closing signal SIGTERM -W0703 04:19:39.441000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 804230 closing signal SIGTERM -W0703 04:19:39.441000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 804231 closing signal SIGTERM -W0703 04:19:39.442000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 804232 closing signal SIGTERM -W0703 04:19:39.442000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 804233 closing signal SIGTERM -W0703 04:19:39.442000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 804235 closing signal SIGTERM -E0703 04:19:39.629000 139909740459840 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 906712) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:19:39.641000 139909740459840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 
'ip-26-0-165-24.ec2.internal_906642_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 04:19:39.648000 140126076323648 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 846793) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:19:39.660000 140126076323648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_846724_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.671000 139909740459840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_906642_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.694000 140126076323648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_846724_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:39.697000 139909740459840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_906642_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:19:39 - host : ip-26-0-165-24.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 906713) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 906713 -[2]: - time : 2024-07-03_04:19:39 - host : ip-26-0-165-24.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 906714) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 906714 -[3]: - time : 2024-07-03_04:19:39 - host : ip-26-0-165-24.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 906715) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 906715 -[4]: - time : 2024-07-03_04:19:39 - host : ip-26-0-165-24.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 906717) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 906717 -[5]: - time : 2024-07-03_04:19:39 - host : ip-26-0-165-24.ec2.internal - rank : 30 (local_rank: 6) 
- exitcode : -6 (pid: 906718) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 906718 -[6]: - time : 2024-07-03_04:19:39 - host : ip-26-0-165-24.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 906719) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 906719 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:19:39 - host : ip-26-0-165-24.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 906712) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 906712 -============================================================ -W0703 04:19:39.724000 140126076323648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_846724_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 846794) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846794 -[2]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 846795) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846795 -[3]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 846796) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846796 -[4]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 846797) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846797 -[5]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 846798) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846798 -[6]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 846799) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846799 -[7]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 846800) - 
error_file: - traceback : Signal 6 (SIGABRT) received by PID 846800 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:19:39 - host : ip-26-0-174-36.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 846793) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 846793 -============================================================ -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -E0703 04:19:40.849000 140275647977280 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 417275) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:19:40.862000 140275647977280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_417204_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:40.893000 140275647977280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_417204_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:40.910000 140275647977280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_417204_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:19:39 - host : ip-26-0-164-207.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 417278) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 417278 -[2]: - time : 2024-07-03_04:19:39 - host : ip-26-0-164-207.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 417279) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 417279 -[3]: - time : 2024-07-03_04:19:39 - host : ip-26-0-164-207.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 417281) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 417281 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 
2024-07-03_04:19:39 - host : ip-26-0-164-207.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 417275) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 417275 -============================================================ -E0703 04:19:41.570000 140704491784000 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 6 (pid: 804234) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:19:41.582000 140704491784000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_804157_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:41.615000 140704491784000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_804157_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:41.625000 140704491784000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_804157_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:19:39 - host : ip-26-0-163-147.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 804234) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 804234 -============================================================ -srun: error: ip-26-0-164-207: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-147: task 1: Exited with exit code 1 -W0703 04:19:42.936000 140712564258624 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_333077_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:42.954000 140712564258624 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_333077_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 04:19:43.962000 140428738991936 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1315236_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:43.978000 140428738991936 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1315236_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store -srun: error: ip-26-0-173-246: task 6: Exited with exit code 1 - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 04:19:44.188000 140509718718208 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_32970_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-srun: error: ip-26-0-173-202: task 5: Exited with exit code 1 -W0703 04:19:49.192000 140509718718208 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_32970_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:51.756000 140515379451712 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_32970_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:19:51.772000 140515379451712 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_32970_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-166-125: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-32/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/bench.slurm deleted file mode 100644 index 8dcff4b7b8cafb17e7e644a4d82eaf15c6289968..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. 
-# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/config.yaml deleted file mode 100644 index e7a53ac20258fb46fadea2ce9e1c6429bc3e45fb..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 256 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 4 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out deleted file mode 100644 index 50520650ec682dc4ba32f2b2edce4e516d7a2088..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/log.out +++ /dev/null @@ -1,5810 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:32:34 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:32:36.565000 139741875156800 torch/distributed/run.py:757] -W0703 03:32:36.565000 139741875156800 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.565000 139741875156800 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:32:36.565000 139741875156800 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.564000 140623480780608 torch/distributed/run.py:757] -W0703 03:32:36.564000 140623480780608 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.564000 140623480780608 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:32:36.564000 140623480780608 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.569000 140161389905728 torch/distributed/run.py:757] -W0703 03:32:36.569000 140161389905728 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.569000 140161389905728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:32:36.569000 140161389905728 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.579000 140503982675776 torch/distributed/run.py:757] -W0703 03:32:36.579000 140503982675776 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.579000 140503982675776 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:32:36.579000 140503982675776 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.584000 140075254286144 torch/distributed/run.py:757] -W0703 03:32:36.584000 140075254286144 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.584000 140075254286144 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:32:36.584000 140075254286144 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.595000 139724110952256 torch/distributed/run.py:757] -W0703 03:32:36.595000 139724110952256 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.595000 139724110952256 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:32:36.595000 139724110952256 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.606000 140553412921152 torch/distributed/run.py:757] -W0703 03:32:36.606000 140553412921152 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.606000 140553412921152 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:32:36.606000 140553412921152 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.612000 139840535078720 torch/distributed/run.py:757] -W0703 03:32:36.612000 139840535078720 torch/distributed/run.py:757] ***************************************** -W0703 03:32:36.612000 139840535078720 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:32:36.612000 139840535078720 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:32:56 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config: -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: run='%date_%jobid', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: step=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: consumed_train_samples=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: benchmark_csv_path=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp=2, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp=32, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp_engine=, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_mode=, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: expert_parallel_size=1), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:32:56 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: eos_token_id=2, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_revision=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_max_length=None), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoint_interval=100000, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: save_initial_state=False, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: log_level_replica='info', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: train_steps=20, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: micro_batch_size=4, -[default0]:07/03/2024 03:32:56 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: batch_accumulation_per_replica=256, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: val_check_interval=-1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_val_batches=0, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_test_batches=0), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta1=0.9, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta2=0.95, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: name='adamW'), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: zero_stage=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: weight_decay=0.01, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: clip_grad=1.0, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_steps=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_style='linear', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_style='linear', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_steps=19, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: start_training_step=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_splits='train', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: text_column_name='text'), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_loading_workers=0))], -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4')), -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lighteval=None) -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Model Config: -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: 
eos_token_id=2, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272) -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Building model.. -[default0]:07/03/2024 03:32:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Setting PP block ranks... -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: No checkpoint path provided. -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Parametrizing model parameters using StandardParametrizator -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: No checkpoint path provided. 
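The per-rank parameter counts logged above (21.6M per TP rank on pipeline stage 0, 16.4M per TP rank on stage 1) can be sanity-checked against the logged topology (dp=1, tp=32, pp=2) and the reported total of 1.22G parameters (2318.88MiB in bf16). A quick back-of-the-envelope check, not part of the original run; inputs are the rounded values printed in the log, so the result only matches approximately:

```python
# Sanity check of the logged world size and parameter totals.
# All inputs are the rounded per-rank figures printed in the log above.
dp, tp, pp = 1, 32, 2
world_size = dp * tp * pp
assert world_size == 64  # consistent with the 64_GPUS run directory in the log

params_stage0 = 21.6e6   # per TP rank, PP stage 0
params_stage1 = 16.4e6   # per TP rank, PP stage 1

total_params = (params_stage0 + params_stage1) * tp
print(f"total params ~ {total_params / 1e9:.2f}G")        # ~1.22G, as logged

bf16_bytes = total_params * 2                              # 2 bytes per bf16 param
print(f"bf16 weights ~ {bf16_bytes / 2**20:.0f}MiB")       # ~2319MiB vs logged 2318.88MiB
```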
-[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: No checkpoint path provided. -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: No checkpoint path provided. 
-[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: No checkpoint path provided. 
-[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: No checkpoint path provided. 
-[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 03:33:14 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:33:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 03:33:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 03:33:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 03:33:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 03:33:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Using `datasets` library -[default0]:07/03/2024 03:33:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 03:33:16 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
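The "[ZeRO sharding]" lines above show DP rank 0 holding 100.00% of the optimizer states, which is expected here: ZeRO stage 1 shards optimizer states across data-parallel replicas, and this run has dp=1, so there is nothing to shard. A rough, hedged estimate of the resulting per-TP-rank optimizer memory, assuming the usual AdamW layout of an fp32 master copy plus fp32 momentum/variance, and fp32 gradient accumulation as stated in the config dump (the byte counts below are a common convention, not values printed by the run itself):

```python
# Rough per-rank memory estimate for the zero_stage=1 / AdamW setup in the log.
params_per_rank = 21.6e6          # largest shard: PP stage 0, per TP rank (from the log)
bf16_weights   = params_per_rank * 2   # weights kept in bf16 (dtype=torch.bfloat16)
fp32_master    = params_per_rank * 4   # assumed fp32 master copy held by the optimizer
adam_m_v       = params_per_rank * 8   # assumed fp32 Adam momentum + variance
fp32_grad_acc  = params_per_rank * 4   # accumulate_grad_in_fp32=True

dp = 1                            # ZeRO-1 shards optimizer states over DP; dp=1 => no sharding
optim_states = (fp32_master + adam_m_v) / dp

MiB = 2**20
print(f"bf16 weights     ~ {bf16_weights / MiB:6.1f} MiB")   # ~41 MiB, matches the log
print(f"optimizer states ~ {optim_states / MiB:6.1f} MiB")   # ~247 MiB (estimate)
print(f"fp32 grad accum  ~ {fp32_grad_acc / MiB:6.1f} MiB")  # ~82 MiB (estimate)
```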
-[default0]:07/03/2024 03:33:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 03:33:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 03:33:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: -[default0]:07/03/2024 03:33:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Start training] datetime: 2024-07-03 03:33:17.463550 | mbs: 4 | grad_accum: 256 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 03:33:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 03:33:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=27|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=25|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=2|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=4|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=3|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=21|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=19|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=7|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=5|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=6|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=3|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. 
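The "[Start training]" line above fixes the effective batch: with dp=1, the global batch size is simply micro_batch_size × batch_accumulation_per_replica. A quick check of the logged numbers (not part of the original run):

```python
# Effective batch size / tokens per step, using the values from the [Start training] line.
dp               = 1
micro_batch_size = 4     # "mbs: 4"
grad_accum       = 256   # "grad_accum: 256"
sequence_length  = 4096
train_steps      = 20

global_batch_size = micro_batch_size * grad_accum * dp
tokens_per_step   = global_batch_size * sequence_length

print(global_batch_size)                                            # 1024, as logged
print(f"{tokens_per_step / 1e6:.1f}M tokens per optimizer step")    # ~4.2M
print(f"{tokens_per_step * train_steps / 1e6:.0f}M tokens over the run")  # ~84M
```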
-[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=28|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=24|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=31|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=29|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=26|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=26|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=18|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=23|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=17|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=16|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=22|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=20|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=19|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=28|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=1|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=24|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=0|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=31|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=7|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=5|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=6|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=2|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=30|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=29|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=23|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=25|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=27|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=4|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=30|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=16|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=18|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=21|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=17|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=20|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:33:17 [WARNING|DP=0|PP=1|TP=22|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
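The repeated "Repo card metadata block was not found. Setting CardData to empty." warnings appear once per rank and seem to come from the data-loading step: the config dump points at hf_dataset_or_datasets='roneneldan/TinyStories', whose README apparently lacked a YAML metadata block at the time. The warning is noisy but harmless. A minimal, hedged sketch of the equivalent standalone loading step implied by the config (this is not the code path nanotron actually runs, just the corresponding `datasets`/`transformers` calls, which may reproduce the same warning):

```python
# Standalone equivalent of the dataset/tokenizer described in the config dump:
# dataset 'roneneldan/TinyStories', split 'train', text column 'text',
# tokenizer 'openai-community/gpt2'.
from datasets import load_dataset
from transformers import AutoTokenizer

ds = load_dataset("roneneldan/TinyStories", split="train")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

print(ds)                                         # dataset size and columns
print(tokenizer(ds[0]["text"])["input_ids"][:16]) # first few token ids of one sample
```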
-[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:33:17 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[... the identical UserWarning and run_backward frame are emitted by every local rank (default0-default7) on each node as the backward pass starts; the remaining duplicate copies are omitted here ...]
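The warning above comes from autograd backpropagating through a bare c10d::allreduce_ call. A common way to keep the engine away from the raw collective is to wrap it in a torch.autograd.Function with an explicit backward. The sketch below is hypothetical and not nanotron's actual code (the class and helper names are illustrative); it shows the case where per-rank partial sums are reduced and the gradient passes through unchanged.

```
# Hypothetical sketch, not nanotron's implementation: wrap the collective in a
# torch.autograd.Function so autograd never differentiates through the raw
# c10d::allreduce_ op. Assumes torch.distributed is already initialized
# (e.g. when launched with torchrun).
import torch
import torch.distributed as dist


class _ReduceSumAcrossRanks(torch.autograd.Function):
    @staticmethod
    def forward(ctx, tensor: torch.Tensor) -> torch.Tensor:
        out = tensor.clone()  # reduce a copy instead of mutating the graph input in place
        dist.all_reduce(out, op=dist.ReduceOp.SUM)
        return out

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
        # The output is a plain sum of the per-rank inputs, so the gradient with
        # respect to each rank's input is the output gradient unchanged.
        return grad_output


def reduce_sum_across_ranks(tensor: torch.Tensor) -> torch.Tensor:
    return _ReduceSumAcrossRanks.apply(tensor)
```

Whether the backward should be an identity, another all-reduce, or something else depends on how the tensor is partitioned; the point of the pattern is only that the collective gets an explicitly registered backward.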
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600021 milliseconds before timing out.
-[... every one of the 64 global ranks logs the same SeqNum=15 SEND watchdog timeout after roughly 600 s (600000-600099 ms): ranks 0-31 report it as [Rank 0] with NumelIn=16384 and ranks 32-63 as [Rank 1] with NumelIn=1048576; a few further examples follow and the remaining duplicates are omitted here ...]
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in 
backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: self.grads_buffer.append(recv_grad()) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank8]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank8]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank8]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank8]: dist.recv( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank8]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank9]: self.grads_buffer.append(recv_grad()) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank9]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank9]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank9]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank9]: dist.recv( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank9]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
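Ranks 48 and 49 above (and ranks 56 and 63 in the interleaved tracebacks that follow) fail on the forward path instead, in recv_activation() rather than recv_grad(), but reach the same P2P._recv_meta -> dist.recv call. When triaging a dump like this, the per-rank watchdog lines can be condensed with a small stand-alone helper; the script below is hypothetical and not part of the repository.

```
# Hypothetical helper, not part of the repository: group the ProcessGroupNCCL
# watchdog timeout lines from a log.out dump by message size and report which
# ranks hit them and how long they ran. Standard library only.
import re
import sys
from collections import defaultdict

WATCHDOG = re.compile(
    r"\[rank(?P<rank>\d+)\]:\[E ProcessGroupNCCL\.cpp:\d+\].*?"
    r"NumelIn=(?P<numel>\d+).*?ran for (?P<ms>\d+) milliseconds"
)


def summarize_watchdog_timeouts(log_text: str) -> dict[int, list[tuple[int, int]]]:
    """Map NumelIn -> list of (global rank, elapsed ms) for every timeout line."""
    by_numel: dict[int, list[tuple[int, int]]] = defaultdict(list)
    for match in WATCHDOG.finditer(log_text):
        by_numel[int(match["numel"])].append((int(match["rank"]), int(match["ms"])))
    return by_numel


if __name__ == "__main__":
    text = open(sys.argv[1]).read() if len(sys.argv) > 1 else sys.stdin.read()
    for numel, entries in sorted(summarize_watchdog_timeouts(text).items()):
        ranks = sorted(rank for rank, _ in entries)
        elapsed = [ms for _, ms in entries]
        print(
            f"NumelIn={numel}: {len(entries)} ranks "
            f"(lowest rank {ranks[0]}, highest rank {ranks[-1]}), "
            f"elapsed {min(elapsed)}-{max(elapsed)} ms"
        )
```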
-[default0]:[rank56]: Traceback (most recent call last):
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank56]: trainer.train(dataloader)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank56]: output = model(**micro_batch)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank56]: sharded_logits = self.model(
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default0]:[rank56]: pipeline_state.run_communication()
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default0]:[rank56]: recv_activation_tensor = recv_activation()
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank56]: dist.recv(
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank56]: return func(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[rank 63 emits the same forward-path traceback as rank 56, ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.]
-[default1]:[rank1]: Traceback (most recent call last):
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank1]: trainer.train(dataloader)
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default1]:[rank1]: grad_accumulator.backward(sum(activations))
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default1]:[rank1]: result = loss.backward()
-[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default1]:[rank1]: torch.autograd.backward(
-[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default1]:[rank1]: _engine_run_backward(
-[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default1]:[rank1]: return user_fn(self, *args)
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default1]:[rank1]: pipeline_state.run_communication()
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default1]:[rank1]: self.grads_buffer.append(recv_grad())
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default1]:[rank1]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank1]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank1]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank1]: dist.recv(
-[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank1]: return func(*args, **kwargs)
-[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank1]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[ranks 4, 28, 27, 6, 5, 3, 11, 29, 26, 0, and 21 emit the same backward-path traceback as rank 1 above (train_batch_iter -> backward -> grad_accumulator.backward -> recv_grad -> P2P._recv_meta -> dist.recv), each ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.]
[rank 55 emits the same forward-path traceback as rank 56 above (train_batch_iter -> forward -> recv_from_pipeline_state_buffer -> recv_activation -> P2P._recv_meta -> dist.recv), ending in: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.]
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank21]: self.grads_buffer.append(recv_grad()) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank21]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank21]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank21]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank21]: dist.recv( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank21]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", 
line 744, in _engine_run_backward -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank20]: self.grads_buffer.append(recv_grad()) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank20]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank20]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank20]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank20]: dist.recv( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank20]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank58]: Traceback (most recent call last):
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank58]:     trainer.train(dataloader)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank58]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank58]:     outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:[rank58]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank58]:     output = model(**micro_batch)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank58]:     return self._call_impl(*args, **kwargs)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank58]:     return forward_call(*args, **kwargs)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank58]:     sharded_logits = self.model(
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank58]:     return self._call_impl(*args, **kwargs)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank58]:     return forward_call(*args, **kwargs)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank58]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank58]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank58]:     return self._call_impl(*args, **kwargs)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank58]:     return forward_call(*args, **kwargs)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default2]:[rank58]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default2]:[rank58]:     pipeline_state.run_communication()
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default2]:[rank58]:     recv_activation_tensor = recv_activation()
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default2]:[rank58]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank58]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank58]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default2]:[rank58]:     dist.recv(
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank58]:     return func(*args, **kwargs)
-[default2]:[rank58]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank58]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
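Every failing rank above dies inside P2P._recv_meta, i.e. while blocked on the small metadata receive that precedes the actual activation or gradient tensor. As a rough illustration only (a hypothetical sketch under assumptions, not nanotron's actual p2p.py implementation), the receive side of such a metadata-then-payload protocol looks roughly like the following; the blocking dist.recv() in phase 1 is the call that raises once a peer's NCCL communicator has been aborted.

# Hypothetical sketch of a metadata-then-payload P2P receive (NOT nanotron's p2p.py).
# Assumes torch.distributed has already been initialized with the NCCL backend and
# that the sender transmits ndim, then shape, then the payload; dtype is assumed
# bfloat16 purely for illustration.
import torch
import torch.distributed as dist


def recv_tensor_sketch(from_rank: int, device: torch.device) -> torch.Tensor:
    # Phase 1: blocking receive of the metadata describing the incoming tensor
    # (number of dimensions, then the shape). This mirrors the dist.recv() call
    # in _recv_meta where the ranks above fail.
    ndim = torch.empty(1, dtype=torch.int64, device=device)
    dist.recv(ndim, src=from_rank)
    shape = torch.empty(int(ndim.item()), dtype=torch.int64, device=device)
    dist.recv(shape, src=from_rank)

    # Phase 2: asynchronous receive of the payload itself, then wait on the
    # returned work handle, analogous to irecv_tensors() returning buffers/futures.
    payload = torch.empty(tuple(shape.tolist()), dtype=torch.bfloat16, device=device)
    work = dist.irecv(payload, src=from_rank)
    work.wait()
    return payload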
[... rank 19 reports the same backward-path traceback as rank 21 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0." ...]
[... rank 51 reports the same forward-path traceback as rank 58 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." ...]
[... ranks 50, 36, 34, 43, 45 and 62 each report the same forward-path traceback as rank 58 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." ...]
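When triaging a run like this one, it can help to collapse the wall of per-rank tracebacks into a per-failure-mode summary. A small, hypothetical helper (not part of the bench_cluster repo) that groups the "NCCL communicator was aborted" lines above by the peer they blame might look like this:

# Hypothetical triage helper: group the ranks in a log like the one above by which
# peer's NCCL communicator abort they reported.
import re
from collections import defaultdict

ABORT_RE = re.compile(
    r"\[rank(\d+)\]: torch\.distributed\.DistBackendError: "
    r"NCCL communicator was aborted on rank (\d+)\."
)


def group_aborts(log_text: str) -> dict:
    """Map 'aborted on rank N' -> sorted list of global ranks reporting it."""
    groups = defaultdict(set)
    for match in ABORT_RE.finditer(log_text):
        reporting_rank, aborted_on = int(match.group(1)), match.group(2)
        groups[f"aborted on rank {aborted_on}"].add(reporting_rank)
    return {key: sorted(ranks) for key, ranks in groups.items()}


# Example usage (assuming the log file path): group_aborts(open("log.out").read())
# would return something like {"aborted on rank 0": [16, 18, ...], "aborted on rank 1": [34, 36, ...]}.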
[... ranks 16 and 18 each report the same backward-path traceback as rank 21 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0." ...]
[... rank 17 begins the same backward-path traceback; it is cut off mid-frame at the end of this hunk and continues below ...]
-[default1]:[rank17]:   File
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer
-[default6]:[rank46]: pipeline_state.run_communication()
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank46]: recv_activation_tensor = recv_activation()
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank46]: dist.recv(
-[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank46]: return func(*args, **kwargs)
-[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600035 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3651887897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3652b60c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3652b65a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3652b66dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f369e5ffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f36a3646609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f36a3411353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600035 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3651887897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3652b60c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3652b65a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3652b66dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f369e5ffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f36a3646609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f36a3411353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3651887897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: + 0xe32119 (0x7f36527ea119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: + 0xd3e95 (0x7f369e5ffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: + 0x8609 (0x7f36a3646609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f36a3411353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank22]: self.grads_buffer.append(recv_grad()) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank22]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank22]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default6]:[rank22]: dist.recv( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank22]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) 
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank23]: Traceback (most recent call last):
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank23]:     trainer.train(dataloader)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank23]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank23]:     outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank23]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank23]:     grad_accumulator.backward(sum(activations))
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank23]:     result = loss.backward()
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank23]:     torch.autograd.backward(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank23]:     _engine_run_backward(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank23]:     return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank23]:     return user_fn(self, *args)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank23]:     pipeline_state.run_communication()
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default7]:[rank23]:     self.grads_buffer.append(recv_grad())
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default7]:[rank23]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank23]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank23]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank23]:     dist.recv(
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank23]:     return func(*args, **kwargs)
-[default7]:[rank23]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank23]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
- [... rank 52 raised the same forward-pass traceback as rank 41 above (run_train.py -> trainer.train -> training_step -> train_batch_iter -> engine forward -> llama.py forward -> forward_with_hidden_states -> PipelineBlock.forward -> recv_from_pipeline_state_buffer -> recv_activation -> P2P._recv_meta -> dist.recv), ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." ...]
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b613a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6b6267cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6b62681a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6b62682dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f6bae11be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f6bb3162609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f6bb2f2d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]:  what():  [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600012 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b613a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6b6267cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6b62681a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6b62682dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f6bae11be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f6bb3162609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f6bb2f2d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b613a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: + 0xe32119 (0x7f6b62306119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: + 0xd3e95 (0x7f6bae11be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: + 0x8609 (0x7f6bb3162609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7f6bb2f2d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
- [... equivalent ProcessGroupNCCL watchdog dumps ([PG 4 Rank 1] Timeout at NCCL work: 15, WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) running 600011-600099 ms, followed by the same checkTimeout and ncclCommWatchdog backtraces and a c10::DistBackendError terminate) were emitted by ranks 49, 51, 55, 56, 58, 61 and 63, with the rank 58 and rank 61 output interleaved line by line; rank 9 reported the same timeout on PG 4 Rank 0 with NumelIn=16384, NumelOut=16384 after 600021 ms; ranks 35, 53 and 54 raised the same forward-pass traceback as rank 52 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." ...]
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab81586897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fab8285fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fab82864a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fab82865dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: + 0xd3e95 (0x7fabce2fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: + 0x8609 (0x7fabd3345609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7fabd3110353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]:  what():  [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab81586897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fab8285fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fab82864a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fab82865dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fabce2fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fabd3345609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fabd3110353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab81586897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fab824e9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fabce2fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fabd3345609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fabd3110353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank14]: self.grads_buffer.append(recv_grad()) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank14]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank14]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank14]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank14]: dist.recv( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank14]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank15]: Traceback (most recent call last): -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default7]:[rank15]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank15]: self.grads_buffer.append(recv_grad()) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank15]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank15]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank15]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank15]: dist.recv( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank15]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank10]: self.grads_buffer.append(recv_grad()) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank10]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank10]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank10]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank10]: dist.recv( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank10]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47abcef897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47acfc8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47acfcda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47acfcedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f47f8a67e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f47fdaae609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f47fd879353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47abcef897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47acfc8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47acfcda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47acfcedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f47f8a67e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f47fdaae609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f47fd879353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47abcef897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f47acc52119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f47f8a67e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f47fdaae609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f47fd879353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default6]:[rank30]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank31]: self.grads_buffer.append(recv_grad()) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: torch.autograd.backward( -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank30]: self.grads_buffer.append(recv_grad()) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank30]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default6]:[rank30]: dist.recv( -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f939205d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f308982f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank30]: return func(*args, **kwargs) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9393336c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f308ab08c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f939333ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59952fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59965d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59965d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59965dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f59e2073e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank2]: self.grads_buffer.append(recv_grad()) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank2]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank2]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank2]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:frame #5: + 0x8609 (0x7f59e70ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:[rank2]: dist.recv( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default1]:frame #6: clone + 0x43 (0x7f59e6e85353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank2]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59952fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59965d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59965d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59965dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f59e2073e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f59e70ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f59e6e85353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59952fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f599625e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f59e2073e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f59e70ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f59e6e85353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3fa040f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3fa16e8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3fa16eda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3fa16eedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f3fed187e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f3ff21ce609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f3ff1f99353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3fa040f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3fa16e8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3fa16eda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3fa16eedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f3fed187e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f3ff21ce609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f3ff1f99353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3fa040f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f3fa1372119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f3fed187e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f3ff21ce609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f3ff1f99353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f783b60b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f27232c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f783c8e4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f783c8e9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f783c8eadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7888383e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f788d3ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f788d195353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f783b60b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f272459dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f27245a2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f27245a3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f277003ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f783c8e4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f783c8e9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f783c8eadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7888383e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7f788d3ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f788d195353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #5: + 0x8609 (0x7f2775083609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2774e4e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f27232c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f783c56e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f7888383e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f272459dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f27245a2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f27245a3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f277003ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f2775083609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f2774e4e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:frame #3: + 0x8609 (0x7f788d3ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f788d195353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f27232c4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f2724227119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f277003ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f2775083609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f2774e4e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]:
trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe198c94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe199f6dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe199f72a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe199f73dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe1e5a0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe1eaa53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe1ea81e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe198c94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe199f6dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe199f72a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe199f73dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe1e5a0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe1eaa53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe1ea81e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe198c94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fe199bf7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fe1e5a0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fe1eaa53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fe1ea81e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78aa103897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f78ab3dcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f78ab3e1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f78ab3e2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f78f6e7be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f78fbec2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f78fbc8d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78aa103897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank13]: self.grads_buffer.append(recv_grad()) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank13]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank13]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank13]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank13]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank13]: dist.recv( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank13]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f308ab0da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f939333cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:frame #4: + 0xd3e95 (0x7f93dedd5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank12]: self.grads_buffer.append(recv_grad()) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f308ab0edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank12]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank12]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:frame #4: + 0xd3e95 (0x7f30d65a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:[rank12]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:frame #5: + 0x8609 (0x7f30db5ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f30db3b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:[rank12]: dist.recv( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:frame #5: + 0x8609 (0x7f93e3e1c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:[rank12]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. 
-[default2]:frame #6: clone + 0x43 (0x7f93e3be7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f308982f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f308ab08c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f308ab0da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]: -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f308ab0edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f30d65a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f30db5ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #6: clone + 0x43 (0x7f30db3b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f308982f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f308a792119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f30d65a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default5]:frame #3: + 0x8609 (0x7f30db5ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f939205d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #4: clone + 0x43 (0x7f30db3b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9393336c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f939333ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f939333cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f93dedd5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f93e3e1c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f93e3be7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f939205d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f9392fc0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f93dedd5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f93e3e1c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f93e3be7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, 
in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank25]: self.grads_buffer.append(recv_grad()) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank25]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank25]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank25]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank25]: dist.recv( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank25]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff12ffeb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff1312c4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff1312c9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff1312cadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff17cd63e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff181daa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff181b75353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff12ffeb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff1312c4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff1312c9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff1312cadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff17cd63e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff181daa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff181b75353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff12ffeb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ff130f4e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff17cd63e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7ff181daa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ff181b75353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank24]: self.grads_buffer.append(recv_grad()) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank24]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank24]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank24]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank24]: dist.recv( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank24]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5dc10a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5dd3e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5dd3e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5dd3e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fc628e82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fc62dec9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fc62dc94353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5dc10a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5dd3e3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5dd3e8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5dd3e9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fc628e82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fc62dec9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fc62dc94353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5dc10a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fc5dd06d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fc628e82e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fc62dec9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fc62dc94353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f78ab3dcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f78ab3e1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f78ab3e2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f78f6e7be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f78fbec2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f78fbc8d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3dc30f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3dc43cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3dc43cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3dc43d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f3e0fe69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3e14eb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3e14c7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3dc30f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78aa103897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f78ab066119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f78f6e7be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f78fbec2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f78fbc8d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3dc43cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3dc43cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3dc43d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f3e0fe69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]: -[default5]:frame #5: + 0x8609 (0x7f3e14eb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3e14c7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3dc30f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f3dc4054119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f3e0fe69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f3e14eb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f3e14c7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. 
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc71e9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc73178c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc7317da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc7317edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7efcbec17e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7efcc3c5e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7efcc3a29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc71e9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc73178c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc7317da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc7317edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7efcbec17e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7efcc3c5e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7efcc3a29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc71e9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7efc72e02119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7efcbec17e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7efcc3c5e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7efcc3a29353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5c40481897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5c4175ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5c4175fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5c41760dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5c8d1f9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5c92240609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5c9200b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5c40481897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5c4175ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5c4175fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5c41760dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5c8d1f9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5c92240609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5c9200b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5c40481897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f5c413e4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f5c8d1f9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f5c92240609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f5c9200b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff556dfd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff5580d6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff5580dba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff5580dcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff5a3b75e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff5a8bbc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:frame #6: clone + 0x43 (0x7ff5a8987353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff556dfd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. 
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efe804b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efe81789c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff5580d6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efe8178ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efe8178fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff5580dba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7efecd228e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff5580dcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #5: + 0x8609 (0x7efed226f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: + 0xd3e95 (0x7ff5a3b75e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #6: clone + 0x43 (0x7efed203a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #5: + 0x8609 (0x7ff5a8bbc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff5a8987353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efe804b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efe81789c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efe8178ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff556dfd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efe8178fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7efecd228e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #1: + 0xe32119 (0x7ff557d60119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff5a3b75e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7ff5a8bbc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #5: + 0x8609 (0x7efed226f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7efed203a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:frame #4: clone + 0x43 (0x7ff5a8987353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efe804b0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]: -[default1]:frame #1: + 0xe32119 (0x7efe81413119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7efecd228e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7efed226f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7efed203a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = 
self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d76a64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d77d3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d77d42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d77d43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f9dc37dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f9dc8823609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f9dc85ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d76a64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d77d3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d77d42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d77d43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f9dc37dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f9dc8823609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f9dc85ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d76a64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f9d779c7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f9dc37dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f9dc8823609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f9dc85ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa95db7b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa95ee54c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa95ee59a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa95ee5adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fa9aa8f3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fa9af93a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fa9af705353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa95db7b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa95ee54c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa95ee59a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa95ee5adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fa9aa8f3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fa9af93a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fa9af705353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa95db7b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fa95eade119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fa9aa8f3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fa9af93a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fa9af705353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f174ebc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f174fe9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f174fea4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f174fea5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f179b93ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f17a0985609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f17a0750353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f174ebc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f174fe9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f174fea4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f174fea5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f179b93ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f17a0985609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f17a0750353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f174ebc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f174fb29119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f179b93ee95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f17a0985609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f17a0750353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72219df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7222cb8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7222cbda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7222cbedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f726e757e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f727379e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f7273569353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72219df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7222cb8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7222cbda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7222cbedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f726e757e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f727379e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f7273569353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72219df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f7222942119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f726e757e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f727379e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f7273569353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. 
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f691d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1f6a4b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1f6a4b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea1189e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1f6a4b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea12b77c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea12b7ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea12b7ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1fb5f51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #4: + 0xd3e95 (0x7fea5e616e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1fbaf98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1fbad63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #5: + 0x8609 (0x7fea6365d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fea63428353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f691d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1f6a4b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1f6a4b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600086 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1f6a4b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea1189e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea12b77c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea12b7ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea12b7ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fea5e616e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fea6365d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fea63428353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:frame #4: + 0xd3e95 (0x7f1fb5f51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea1189e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fea12801119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fea5e616e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fea6365d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fea63428353 in /lib/x86_64-linux-gnu/libc.so.6) 
-[default7]:frame #5: + 0x8609 (0x7f1fbaf98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1fbad63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f691d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]: -[default7]:frame #1: + 0xe32119 (0x7f1f6a13c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f1fb5f51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f1fbaf98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f1fbad63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c534e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1c547bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1c547c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1c547c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f1ca025be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f1ca52a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f1ca506d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c534e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1c547bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1c547c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1c547c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f1ca025be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f1ca52a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f1ca506d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c534e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f1c54446119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f1ca025be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f1ca52a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f1ca506d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5c7dd6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5c90afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5c90b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5c90b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fc614b4ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fc619b95609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fc619960353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5c7dd6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5c90afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5c90b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5c90b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fc614b4ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fc619b95609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fc619960353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc5c7dd6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fc5c8d39119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fc614b4ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fc619b95609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fc619960353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
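The traceback above from rank 57 (and the near-identical one from rank 42 that follows) shows where these watchdog aborts originate: the rank blocks in dist.recv() inside nanotron's P2P._recv_meta() waiting for activation metadata from its pipeline peer, while the matching point-to-point transfer on the other side stalls as well (the watchdog entries report OpType=SEND timing out on SeqNum=15), so after Timeout(ms)=600000 the NCCL watchdog tears every process down. The 600000 ms figure corresponds to the default 10-minute NCCL process-group timeout in PyTorch. As a hedged illustration only (this is not code from the nanotron repo, and the helper name init_distributed is made up), the knob lives on the process group itself and can be raised at initialization when long pipeline stalls are expected:

import datetime

import torch.distributed as dist


def init_distributed(timeout_minutes: int = 30) -> None:
    # torchrun already exports MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE,
    # so the env:// init method needs no extra arguments here.
    dist.init_process_group(
        backend="nccl",
        init_method="env://",
        # Default for NCCL is 10 minutes (600000 ms), which is the limit
        # the watchdog is enforcing in the traces above.
        timeout=datetime.timedelta(minutes=timeout_minutes),
    )

Raising the timeout only hides the symptom, of course; when every rank stalls on the same SeqNum=15 send/receive pair, that usually points at the pipeline-parallel schedule itself (a stage that never posts its matching receive) rather than at genuinely slow communication.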
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc3384a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc34b23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc34b28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc34b29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fdc805c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fdc85609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fdc853d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc3384a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc34b23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc34b28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc34b29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fdc805c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fdc85609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fdc853d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc3384a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fdc347ad119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fdc805c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fdc85609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fdc853d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dc9856897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0dcab2fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0dcab34a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0dcab35dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f0e165cee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0e1b615609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f0e1b3e0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dc9856897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0dcab2fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0dcab34a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0dcab35dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f0e165cee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0e1b615609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f0e1b3e0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dc9856897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f0dca7b9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f0e165cee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f0e1b615609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f0e1b3e0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f668b1c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f668c49bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f668c4a0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f668c4a1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f66d7f3ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f66dcf81609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f66dcd4c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f668b1c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f668c49bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f668c4a0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f668c4a1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f66d7f3ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f66dcf81609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f66dcd4c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f668b1c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f668c125119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f66d7f3ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f66dcf81609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f66dcd4c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f00f9ce5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f00fafbec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f00fafc3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f00fafc4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0bd5e46897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #4: + 0xd3e95 (0x7f0146a5de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f014baa4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4e74a03897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4e75cdcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0bd711fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4e75ce1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0bd7124a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4e75ce2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0bd7125dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f0c22bbee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #4: + 0xd3e95 (0x7f4ec177be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f0c27c05609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #5: + 0x8609 (0x7f4ec67c2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f4ec658d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:frame #6: clone + 0x43 (0x7f014b86f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #6: clone + 0x43 (0x7f0c279d0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]: -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog 
caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600046 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6f19cf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6f2ca8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6f2cada80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6f2caedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7fa73e747e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7fa74378e609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fa743559353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out.
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
[The deleted log repeats the same watchdog timeout report (WorkNCCL SeqNum=15, OpType=SEND, NumelIn=16384 or 1048576, Timeout(ms)=600000) and the identical c10::DistBackendError stack trace through checkTimeout, watchdogHandler, and ncclCommWatchdog for ranks 2, 7, 24, 25, 30, 31, 32, 35, 37, 53, 54, 59, and 60, differing only in shared-library load addresses and elapsed times between 600000 and 600099 ms.]
-[default5]:frame #2: + 0xd3e95 (0x7fe8971dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: + 0x8609 (0x7fe89c226609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fe89bff1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5bbaf96897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5bbc26fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5bbc274a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5bbc275dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5c07d0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5c0cd55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f5c0cb20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5bbaf96897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5bbc26fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5bbc274a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5bbc275dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5c07d0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5c0cd55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f5c0cb20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5bbaf96897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f5bbbef9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f5c07d0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f5c0cd55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f5c0cb20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f331ecc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f331ff9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f331ffa4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f331ffa5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f336ba3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f3370a85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f3370850353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f331ecc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f331ff9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f331ffa4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f331ffa5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f336ba3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f3370a85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f3370850353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f331ecc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f331fc29119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f336ba3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f3370a85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f3370850353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94a484a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f94a5b23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f94a5b28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f94a5b29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f94f15c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f94f6609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f94f63d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94a484a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f94a5b23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f94a5b28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f94a5b29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f94f15c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f94f6609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f94f63d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94a484a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f94a57ad119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f94f15c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f94f6609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f94f63d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff0b41ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff0b54c7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff0b54cca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff0b54cddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff100f66e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff105fad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff105d78353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff0b41ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff0b54c7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff0b54cca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff0b54cddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff100f66e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff105fad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff105d78353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff0b41ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ff0b5151119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff100f66e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7ff105fad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ff105d78353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f61a2a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8f62d03c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8f62d08a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8f62d09dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8fae7a2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8fb37e9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8fb35b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f13a03b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f13a168ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f13a1693a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f13a1694dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f13ed12de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f61a2a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #5: + 0x8609 (0x7f13f2174609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8f62d03c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #6: clone + 0x43 (0x7f13f1f3f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f13a03b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f13a168ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8f62d08a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8f62d09dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f13a1693a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f13a1694dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8fae7a2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8fb37e9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8fb35b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:frame #4: + 0xd3e95 (0x7f13ed12de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f13f2174609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f13f1f3f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f13a03b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]: -[default7]:frame #1: + 0xe32119 (0x7f13a1318119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8f61a2a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f8f6298d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f8fae7a2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f8fb37e9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f8fb35b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:frame #2: + 0xd3e95 (0x7f13ed12de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]: -[default7]:frame #3: + 0x8609 (0x7f13f2174609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame 
#4: clone + 0x43 (0x7f13f1f3f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f92dc310897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f92dd5e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f92dd5eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f92dd5efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9329088e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f932e0cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f932de9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f92dc310897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f92dd5e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f92dd5eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f92dd5efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9329088e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f932e0cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f932de9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f92dc310897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f92dd273119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9329088e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f932e0cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f932de9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f9f232897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1fa050bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1fa0510a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1fa0511dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f1febfaae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f1ff0ff1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f1ff0dbc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f9f232897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1fa050bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1fa0510a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1fa0511dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f1febfaae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f1ff0ff1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f1ff0dbc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f9f232897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f1fa0195119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f1febfaae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f1ff0ff1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f1ff0dbc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f10f3f4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f10f5227c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f10f522ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f10f522ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f1140cc6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f1145d0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1145ad8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f10f3f4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f10f5227c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f10f522ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f10f522ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f1140cc6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f1145d0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f1145ad8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f10f3f4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f10f4eb1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f1140cc6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f1145d0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f1145ad8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -W0703 03:43:38.348000 140553412921152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 904343 closing signal SIGTERM -W0703 03:43:38.348000 140553412921152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 904344 closing signal SIGTERM -W0703 03:43:38.348000 140553412921152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 904345 closing signal SIGTERM -W0703 03:43:38.348000 140553412921152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 904346 closing signal SIGTERM -W0703 03:43:38.348000 140553412921152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 904347 closing signal SIGTERM -W0703 03:43:38.348000 140553412921152 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 904348 closing signal SIGTERM -W0703 03:43:38.425000 140075254286144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 891103 closing signal SIGTERM -W0703 03:43:38.425000 140075254286144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 891104 closing signal SIGTERM -W0703 03:43:38.425000 140075254286144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 891105 closing signal SIGTERM -W0703 03:43:38.425000 140075254286144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 891106 closing signal SIGTERM -W0703 03:43:38.425000 140075254286144 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 891107 closing signal SIGTERM -W0703 03:43:38.425000 140075254286144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 891108 closing signal SIGTERM -W0703 03:43:38.425000 140075254286144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 891109 closing signal SIGTERM -E0703 03:43:39.746000 140075254286144 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 891102) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:38 - host : ip-26-0-161-103.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 891102) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 891102 -============================================================ -E0703 03:43:39.951000 140553412921152 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 904342) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( 
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:43:38 - host : ip-26-0-171-88.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 904349) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904349 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:38 - host : ip-26-0-171-88.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 904342) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 904342 -============================================================ -srun: error: ip-26-0-161-103: task 1: Exited with exit code 1 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -W0703 03:43:43.411000 139741875156800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1166831 closing signal SIGTERM -W0703 03:43:43.428000 140503982675776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 687049 closing signal SIGTERM -W0703 03:43:43.428000 140503982675776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 687051 closing signal SIGTERM -W0703 03:43:43.429000 140503982675776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 687052 closing signal SIGTERM -W0703 03:43:43.429000 140503982675776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 687053 closing signal SIGTERM -W0703 03:43:43.464000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3916015 closing signal SIGTERM -W0703 03:43:43.464000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3916016 closing signal SIGTERM -W0703 03:43:43.464000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3916017 closing signal SIGTERM -W0703 03:43:43.464000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3916018 closing signal SIGTERM -W0703 03:43:43.464000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3916019 closing signal SIGTERM -W0703 03:43:43.464000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3916020 closing signal SIGTERM -W0703 03:43:43.464000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3916021 closing signal SIGTERM -W0703 03:43:43.467000 139724110952256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1442446 closing signal SIGTERM -W0703 03:43:43.467000 139724110952256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1442447 closing signal SIGTERM -W0703 03:43:43.467000 139724110952256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1442448 closing signal SIGTERM -W0703 03:43:43.467000 139724110952256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1442449 closing signal SIGTERM -W0703 03:43:43.467000 139724110952256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1442450 closing signal SIGTERM -W0703 03:43:43.468000 139724110952256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1442451 closing signal SIGTERM -W0703 03:43:43.468000 139724110952256 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1442452 closing signal SIGTERM -E0703 03:43:43.636000 140623480780608 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 25636) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 03:43:43.654000 139741875156800 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1166825) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 33 (local_rank: 1) - exitcode : -6 (pid: 25637) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25637 -[2]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 25638) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25638 -[3]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 35 (local_rank: 3) - exitcode : -6 (pid: 25639) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25639 -[4]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 25640) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25640 -[5]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 25641) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25641 -[6]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 25642) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25642 -[7]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 39 (local_rank: 7) - exitcode : -6 (pid: 25643) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25643 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:43 - host : ip-26-0-166-125.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 25636) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 25636 
-============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 1166826) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1166826 -[2]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 1166827) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1166827 -[3]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 1166828) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1166828 -[4]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : -6 (pid: 1166829) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1166829 -[5]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 1166830) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1166830 -[6]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 1166832) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1166832 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 1166825) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1166825 -============================================================ -srun: error: ip-26-0-166-125: task 4: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 0: Exited with exit code 1 -E0703 03:43:44.979000 140161389905728 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3916014) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:43:44.993000 140161389905728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3915945_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:43:45.026000 140161389905728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3915945_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:43:45.034000 140161389905728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3915945_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:43 - host : ip-26-0-171-62.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 3916014) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3916014 -============================================================ -E0703 03:43:45.084000 140503982675776 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 687047) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:43:45.099000 140503982675776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_686978_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:43:45.131000 140503982675776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_686978_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:43:45.152000 140503982675776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_686978_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-138.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 687048) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 687048 -[2]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-138.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 687050) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 687050 -[3]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-138.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 687054) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 687054 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-138.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 687047) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 687047 -============================================================ -E0703 03:43:45.567000 139724110952256 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1442445) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:43:45.582000 139724110952256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1442376_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:43:45.614000 139724110952256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1442376_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:43:45.623000 139724110952256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1442376_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:43 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 1442445) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1442445 -============================================================ -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 2: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -W0703 03:43:48.114000 139834874345216 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3786865_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 03:43:48.649000 139840535078720 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3786934) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:43:48.661000 139840535078720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3786865_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:43:48.690000 139840535078720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3786865_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:43:48.727000 139840535078720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3786865_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 3786935) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786935 -[2]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 3786936) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786936 -[3]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 3786937) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786937 -[4]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 3786938) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786938 -[5]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 3786939) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786939 -[6]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 3786940) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786940 -[7]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 3786941) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786941 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:43:48 - host : ip-26-0-171-102.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 3786934) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3786934 -============================================================ -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
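The failure pattern in the log above — every rank exiting with code -6 (SIGABRT), followed by RendezvousConnectionError warnings while the surviving torchrun agents shut down — is what the grep chain in the bench.slurm scripts in this dump keys on after srun returns non-zero: log.out is searched for "OutOfMemoryError", " CUDA error: an illegal memory access" and "Timeout at NCCL", and status.txt is written accordingly, with "fail" as the fallback. A minimal Python sketch of that classification logic follows; the marker strings and status labels are taken from the scripts, while the function name and structure are purely illustrative and not part of the repository.

from pathlib import Path

# Marker strings and status labels mirrored from the grep chain in bench.slurm;
# note the scripts map an illegal-memory-access CUDA error to "oom" as well.
_MARKERS = [
    ("OutOfMemoryError", "oom"),
    (" CUDA error: an illegal memory access", "oom"),
    ("Timeout at NCCL", "timeout"),
]

def classify_run(log_path: str, exit_status: int) -> str:
    """Hypothetical helper: map an srun exit status plus log.out contents to a status string."""
    if exit_status == 0:
        return "completed"
    text = Path(log_path).read_text(errors="ignore")
    for marker, status in _MARKERS:
        if marker in text:
            return status
    return "fail"

# Example: classify_run("log.out", exit_status=1) returns "oom", "timeout" or "fail",
# depending on which marker (if any) appears in the log.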
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-4/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/bench.slurm deleted file mode 100644 index 42d1f084f2e5e656283a5c41309134b044026aac..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/config.yaml deleted file mode 100644 index 3a7a784e28da4a9681449904ab937521d8986429..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 2 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 512 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out deleted file mode 100644 index 394aab519fa07367b8cbace319f9d1ef82e9ac5e..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/log.out +++ /dev/null @@ -1,5871 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:21:20 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:21:22.891000 139880927135552 torch/distributed/run.py:757] -W0703 09:21:22.891000 139880927135552 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.891000 139880927135552 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:21:22.891000 139880927135552 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.894000 140407726884672 torch/distributed/run.py:757] -W0703 09:21:22.894000 140407726884672 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.894000 140407726884672 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:21:22.894000 140407726884672 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.893000 140297246373696 torch/distributed/run.py:757] -W0703 09:21:22.893000 140297246373696 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.893000 140297246373696 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:21:22.893000 140297246373696 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.896000 140259772696384 torch/distributed/run.py:757] -W0703 09:21:22.896000 140259772696384 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.896000 140259772696384 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:21:22.896000 140259772696384 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.895000 139730669508416 torch/distributed/run.py:757] -W0703 09:21:22.895000 139730669508416 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.895000 139730669508416 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:21:22.895000 139730669508416 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.898000 140329572415296 torch/distributed/run.py:757] -W0703 09:21:22.898000 140329572415296 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.898000 140329572415296 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:21:22.898000 140329572415296 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.898000 140100962318144 torch/distributed/run.py:757] -W0703 09:21:22.898000 140100962318144 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.898000 140100962318144 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:21:22.898000 140100962318144 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.978000 139828760536896 torch/distributed/run.py:757] -W0703 09:21:22.978000 139828760536896 torch/distributed/run.py:757] ***************************************** -W0703 09:21:22.978000 139828760536896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:21:22.978000 139828760536896 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:21:42 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=2, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:21:42 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=512, -[default0]:07/03/2024 09:21:42 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=2, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512')), -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 09:21:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=25|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. 
-[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=24|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=27|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=19|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=6|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=1|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=1|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=1|ip-26-0-169-132]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=6|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=6|ip-26-0-169-132]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=3|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=3|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=3|ip-26-0-169-132]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=16|ip-26-0-172-57]: No checkpoint path provided. -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=7|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=7|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=7|ip-26-0-169-132]: No checkpoint path provided. -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=0|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=0|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=0|ip-26-0-169-132]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=2|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=2|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=2|ip-26-0-169-132]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=4|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=4|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=4|ip-26-0-169-132]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=5|ip-26-0-169-132]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=5|ip-26-0-169-132]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=5|ip-26-0-169-132]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:22:01 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 09:22:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:22:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:22:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 09:22:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:22:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 09:22:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:22:04 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 09:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 09:22:05.126011 | mbs: 512 | grad_accum: 2 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=30|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=9|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=5|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=4|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=18|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=20|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=26|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=21|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=19|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=20|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=18|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=17|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=29|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=25|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=24|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=31|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=16|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=22|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=28|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=27|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=10|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=23|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=22|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=19|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=1|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=6|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=3|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=16|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=21|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=0|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=17|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=7|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=2|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=14|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=11|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. 
Setting CardData to empty. -[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=23|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=15|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=25|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=31|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=27|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=30|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=28|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=26|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=24|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:22:05 [WARNING|DP=0|PP=1|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:22:05 [WARNING|DP=0|PP=0|TP=29|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: output = model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank20]: sharded_logits = self.model( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank20]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank20]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank20]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: out = F.linear(input, weight, bias) -[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.73 GiB is free. Including non-PyTorch memory, this process has 71.58 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) 
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank18]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank16]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward 
-[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default0]:[rank16]: return row_linear( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.73 GiB is free. Including non-PyTorch memory, this process has 71.58 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank13]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank13]: output = model(**micro_batch) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank13]: sharded_logits = self.model( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank13]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank13]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank13]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train 
-[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: return row_linear( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank13]: out = F.linear(input, weight, bias) -[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.73 GiB is free. Including non-PyTorch memory, this process has 71.58 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in 
forward_with_hidden_states -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: sharded_logits = self.model( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: output = self.pp_block(**new_kwargs) 
-[default1]:[rank17]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default1]:[rank17]:     output = self.pp_block(**new_kwargs)
-[default1]:[rank17]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank17]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank17]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank17]:     return forward_call(*args, **kwargs)
-[default1]:[rank17]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default1]:[rank17]:     hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default1]:[rank17]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank17]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank17]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank17]:     return forward_call(*args, **kwargs)
-[default1]:[rank17]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default1]:[rank17]:     hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default1]:[rank17]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank17]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank17]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank17]:     return forward_call(*args, **kwargs)
-[default1]:[rank17]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default1]:[rank17]:     return row_linear(
-[default1]:[rank17]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank17]:     out = F.linear(input, weight, bias)
-[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU has a total capacity of 79.33 GiB of which 7.63 GiB is free. Including non-PyTorch memory, this process has 71.69 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[The identical traceback and 8.00 GiB OutOfMemoryError (about 7.6-7.7 GiB free of 79.33 GiB, 61.56 GiB allocated by PyTorch, 230.74 MiB reserved but unallocated) is repeated, interleaved, by ranks 8, 9, 10, 11, 12, 14, 15, 19, 21, 22, 23 and 24. Every failing rank aborts at the same call chain: trainer.train -> training_step -> pipeline_engine.train_batch_iter -> model forward -> llama.py:637 (self.mlp) -> llama.py:172 (self.down_proj) -> tensor_parallel/nn.py:159 (row_linear) -> functional.py:474 (F.linear).]
row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default6]:[rank14]: out = F.linear(input, weight, bias) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.73 GiB is free. Including non-PyTorch memory, this process has 71.58 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.63 GiB is free. Including non-PyTorch memory, this process has 71.69 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.63 GiB is free. Including non-PyTorch memory, this process has 71.69 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default6]:[rank30]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: trainer.train(dataloader) -[default3]:[rank27]: output = model(**micro_batch) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: sharded_logits = self.model( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = model(**micro_batch) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank27]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: return row_linear( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.99 GiB is free. Including non-PyTorch memory, this process has 71.33 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank30]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank30]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.80 GiB is free. Including non-PyTorch memory, this process has 71.51 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank31]: output = model(**micro_batch) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank31]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank31]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank31]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.99 GiB is free. Including non-PyTorch memory, this process has 71.33 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank29]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: output = model(**micro_batch) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: sharded_logits = self.model( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank29]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank29]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: output = self.pp_block(**new_kwargs) -[default5]:[rank29]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank29]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank29]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: return row_linear( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank29]: out = F.linear(input, weight, bias) -[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.99 GiB is free. Including non-PyTorch memory, this process has 71.33 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank25]: Traceback (most recent call last): -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: output = model(**micro_batch) -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: output = model(**micro_batch) -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: sharded_logits = self.model( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default4]:[rank28]: sharded_logits = self.model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: sharded_logits = self.model( -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default4]:[rank28]: 
output = self.pp_block(**new_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default4]:[rank28]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank26]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank28]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank25]: return row_linear( -[default4]:[rank28]: return row_linear( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.99 GiB is free. Including non-PyTorch memory, this process has 71.33 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank28]: out = F.linear(input, weight, bias) -[default2]:[rank26]: return row_linear( -[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.80 GiB is free. Including non-PyTorch memory, this process has 71.51 GiB memory in use. 
Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: out = F.linear(input, weight, bias) -[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.80 GiB is free. Including non-PyTorch memory, this process has 71.51 GiB memory in use. Of the allocated memory 61.56 GiB is allocated by PyTorch, and 230.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928491 closing signal SIGTERM -W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928492 closing signal SIGTERM -W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928493 closing signal SIGTERM -W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928494 closing signal SIGTERM -W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928495 closing signal SIGTERM -W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928496 closing signal SIGTERM -W0703 09:22:29.300000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928497 closing signal SIGTERM -E0703 09:22:31.223000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 7 (pid: 1928498) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ 
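[Every failing rank reports the same pattern: an 8.00 GiB allocation in the tensor-parallel row_linear of the Llama MLP on a 79.33 GiB device that already holds 61.56 GiB of PyTorch allocations, with only 230.74 MiB reserved but unallocated, so fragmentation is not the limiting factor. As a hedged, illustrative sketch (not part of the original job script), the allocator hint quoted in the message could be exported in the launch environment before torchrun starts, though on its own it is unlikely to recover a shortfall of this size:]

    # Illustrative sketch only -- applies the allocator hint printed in the OOM message above.
    export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
    # A configuration change (e.g. a smaller micro-batch or more sharding) would likely still be needed to fit.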
-W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928491 closing signal SIGTERM
-W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928492 closing signal SIGTERM
-W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928493 closing signal SIGTERM
-W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928494 closing signal SIGTERM
-W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928495 closing signal SIGTERM
-W0703 09:22:29.299000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928496 closing signal SIGTERM
-W0703 09:22:29.300000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1928497 closing signal SIGTERM
-E0703 09:22:31.223000 139828760536896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 7 (pid: 1928498) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
- sys.exit(main())
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
- return f(*args, **kwargs)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
- run(args)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
- elastic_launch(
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
- return launch_agent(self._config, self._entrypoint, list(args))
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
- raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time: 2024-07-03_09:22:29  host: ip-26-0-168-238.ec2.internal  rank: 31 (local_rank: 7)  exitcode: 1 (pid: 1928498)  traceback: To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-srun: error: ip-26-0-168-238: task 3: Exited with exit code 1
-E0703 09:22:34.414000 140407726884672 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3285905) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:22:34.418000 140297246373696 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 839241) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
[Identical ChildFailedError tracebacks from the torchrun agents on ip-26-0-163-226 and ip-26-0-163-220; their failure reports, one line per failed worker, all exitcode 1 with an empty error_file and the same traceback pointer (https://pytorch.org/docs/stable/elastic/errors.html):]
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 17 (local_rank: 1)  exitcode: 1 (pid: 3285906)
-[2]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 18 (local_rank: 2)  exitcode: 1 (pid: 3285907)
-[3]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 19 (local_rank: 3)  exitcode: 1 (pid: 3285908)
-[4]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 20 (local_rank: 4)  exitcode: 1 (pid: 3285909)
-[5]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 21 (local_rank: 5)  exitcode: 1 (pid: 3285910)
-[6]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 22 (local_rank: 6)  exitcode: 1 (pid: 3285911)
-[7]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 23 (local_rank: 7)  exitcode: 1 (pid: 3285912)
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time: 2024-07-03_09:22:34  host: ip-26-0-163-226.ec2.internal  rank: 16 (local_rank: 0)  exitcode: 1 (pid: 3285905)
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 9 (local_rank: 1)  exitcode: 1 (pid: 839242)
-[2]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 10 (local_rank: 2)  exitcode: 1 (pid: 839243)
-[3]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 11 (local_rank: 3)  exitcode: 1 (pid: 839244)
-[4]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 12 (local_rank: 4)  exitcode: 1 (pid: 839245)
-[5]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 13 (local_rank: 5)  exitcode: 1 (pid: 839246)
-[6]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 14 (local_rank: 6)  exitcode: 1 (pid: 839247)
-[7]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 15 (local_rank: 7)  exitcode: 1 (pid: 839248)
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time: 2024-07-03_09:22:34  host: ip-26-0-163-220.ec2.internal  rank: 8 (local_rank: 0)  exitcode: 1 (pid: 839241)
-============================================================
-srun: error: ip-26-0-163-226: task 2: Exited with exit code 1
-srun: error: ip-26-0-163-220: task 1: Exited with exit code 1
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 7] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 7] ProcessGroupNCCL preparing to dump debug info.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 2] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 2] ProcessGroupNCCL preparing to dump debug info.
-[default7]:[rank7]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 7] [PG 2 Rank 7] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 6] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 6] ProcessGroupNCCL preparing to dump debug info.
-[default2]:[rank2]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 2] [PG 2 Rank 2] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4
-[default6]:[rank6]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 6] [PG 2 Rank 6] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives.
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 1] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 1] ProcessGroupNCCL preparing to dump debug info. -[default1]:[rank1]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 1] [PG 2 Rank 1] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 3] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 3] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 5] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default3]:[rank3]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 3] [PG 2 Rank 3] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 5] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank5]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 5] [PG 2 Rank 5] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 4] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 4] ProcessGroupNCCL preparing to dump debug info. -[default4]:[rank4]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 4] [PG 2 Rank 4] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 0] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 0] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank0]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 0] [PG 2 Rank 0] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. 
workMetaList_.size() = 3 -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 
117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank56]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank56]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4733245897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank56]: frame #1: + 0x5b3a23e (0x7f476cd6223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f476cd5cc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f476cd5cf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f476cd5dfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f476cd12371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f476cd12371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f476cd12371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f476cd12371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f473451f189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4734526610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4734545978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #12: + 0x5adc309 (0x7f476cd04309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #13: + 0x5ae6f10 (0x7f476cd0ef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #14: + 0x5ae6fa5 (0x7f476cd0efa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #15: + 0x5124446 (0x7f476c34c446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #16: + 0x1acf4b8 (0x7f4768cf74b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #17: + 0x5aee004 (0x7f476cd16004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #18: + 0x5af36b5 (0x7f476cd1b6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #19: + 0xd2631e (0x7f477f90531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #20: + 0x47def4 (0x7f477f05cef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #21: + 0x1445a6 (0x56119b6265a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56119b61fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #23: + 0x150866 (0x56119b632866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56119b61b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56119b626a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #26: PyObject_Call + 0xbc (0x56119b632f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank56]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56119b6192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56119b626a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56119b6178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #30: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56119b6178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #32: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56119b6178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #34: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56119b6178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56119b61ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56119b630c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #38: + 0x211239 (0x56119b6f3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56119b61fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56119b61b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56119b626a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56119b616c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56119b626a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56119b6178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #45: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #46: PyObject_Call + 0xbc (0x56119b632f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56119b6192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #48: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #49: PyObject_Call + 0xbc (0x56119b632f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56119b6192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x56119b626a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56119b61f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56119b630c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #54: + 0x211239 (0x56119b6f3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #55: PyObject_Call + 0x207 (0x56119b633067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56119b6192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #57: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56119b6178fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #59: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #60: PyObject_Call + 0xbc (0x56119b632f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56119b6192b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #62: + 0x150582 (0x56119b632582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #63: PyObject_Call + 0xbc (0x56119b632f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
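The watchdog messages earlier in this log name two environment variables, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC and TORCH_NCCL_ENABLE_MONITORING. As a minimal, hypothetical sketch (not part of the original job script), either knob can be set before the NCCL process group is created when re-running a configuration like this one under torchrun; the 1800-second value is an arbitrary choice larger than the 600-second threshold reported above:

import os
import torch.distributed as dist

# Raise the NCCL watchdog heartbeat timeout above the 600 s reported in the log,
# or disable the monitor entirely while debugging a suspected hang. Both must be
# set before init_process_group(), which constructs the ProcessGroupNCCL that
# reads them.
os.environ.setdefault("TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC", "1800")
# os.environ["TORCH_NCCL_ENABLE_MONITORING"] = "0"  # alternative: turn the monitor off

dist.init_process_group(backend="nccl")  # RANK/WORLD_SIZE/MASTER_* come from torchrun
# ... training loop would run here ...
dist.destroy_process_group()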
-[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank46]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank46]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb55e576897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: frame #1: + 0x5b3a23e (0x7fb59809323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fb59808dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fb59808df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fb59808efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb598043371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb598043371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb598043371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb598043371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fb55f850189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fb55f857610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fb55f876978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #12: + 0x5adc309 (0x7fb598035309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #13: + 0x5ae6f10 (0x7fb59803ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #14: + 0x5ae6fa5 (0x7fb59803ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #15: + 0x5124446 (0x7fb59767d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #16: + 0x1acf4b8 (0x7fb5940284b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #17: + 0x5aee004 (0x7fb598047004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #18: + 0x5af36b5 (0x7fb59804c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #19: + 0xd2631e (0x7fb5aac3631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #20: + 0x47def4 (0x7fb5aa38def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #21: + 0x1445a6 (0x555c54bce5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #22: _PyObject_MakeTpCall + 0x26b (0x555c54bc7a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #23: + 0x150866 (0x555c54bda866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x555c54bc3142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #25: _PyFunction_Vectorcall + 0x6c (0x555c54bcea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #26: PyObject_Call + 0xbc (0x555c54bdaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default6]:[rank46]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x555c54bc12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #28: _PyFunction_Vectorcall + 0x6c (0x555c54bcea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x555c54bbf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #30: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x555c54bbf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #32: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x555c54bbf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #34: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x555c54bbf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x555c54bc6f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #37: _PyObject_Call_Prepend + 0x69 (0x555c54bd8c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #38: + 0x211239 (0x555c54c9b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #39: _PyObject_MakeTpCall + 0x26b (0x555c54bc7a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x555c54bc33e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #41: _PyFunction_Vectorcall + 0x6c (0x555c54bcea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x555c54bbec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #43: _PyFunction_Vectorcall + 0x6c (0x555c54bcea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x555c54bbf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #45: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #46: PyObject_Call + 0xbc (0x555c54bdaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x555c54bc12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #48: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #49: PyObject_Call + 0xbc (0x555c54bdaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x555c54bc12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x555c54bcea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x555c54bc7007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #53: _PyObject_Call_Prepend + 0x69 (0x555c54bd8c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #54: + 0x211239 (0x555c54c9b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #55: PyObject_Call + 0x207 (0x555c54bdb067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x555c54bc12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #57: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x555c54bbf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #59: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #60: PyObject_Call + 0xbc (0x555c54bdaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x555c54bc12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #62: + 0x150582 (0x555c54bda582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #63: PyObject_Call + 0xbc (0x555c54bdaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank61]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank61]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9f3d25897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank61]: frame #1: + 0x5b3a23e (0x7fba2d84223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fba2d83cc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fba2d83cf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fba2d83dfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba2d7f2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba2d7f2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba2d7f2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fba2d7f2371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fb9f4fff189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fb9f5006610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fb9f5025978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: frame #12: + 0x5adc309 (0x7fba2d7e4309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #13: + 0x5ae6f10 (0x7fba2d7eef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #14: + 0x5ae6fa5 (0x7fba2d7eefa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #15: + 0x5124446 (0x7fba2ce2c446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #16: + 0x1acf4b8 (0x7fba297d74b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #17: + 0x5aee004 (0x7fba2d7f6004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #18: + 0x5af36b5 (0x7fba2d7fb6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #19: + 0xd2631e (0x7fba403e531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank61]: frame #20: + 0x47def4 (0x7fba3fb3cef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank61]: frame #21: + 0x1445a6 (0x555ad522b5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #22: _PyObject_MakeTpCall + 0x26b (0x555ad5224a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #23: + 0x150866 (0x555ad5237866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x555ad5220142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #25: _PyFunction_Vectorcall + 0x6c (0x555ad522ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #26: PyObject_Call + 0xbc (0x555ad5237f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default5]:[rank61]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x555ad521e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #28: _PyFunction_Vectorcall + 0x6c (0x555ad522ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x555ad521c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #30: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x555ad521c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #32: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x555ad521c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #34: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x555ad521c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x555ad5223f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #37: _PyObject_Call_Prepend + 0x69 (0x555ad5235c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #38: + 0x211239 (0x555ad52f8239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #39: _PyObject_MakeTpCall + 0x26b (0x555ad5224a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x555ad52203e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #41: _PyFunction_Vectorcall + 0x6c (0x555ad522ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x555ad521bc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #43: _PyFunction_Vectorcall + 0x6c (0x555ad522ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x555ad521c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #45: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #46: PyObject_Call + 0xbc (0x555ad5237f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x555ad521e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #48: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #49: PyObject_Call + 0xbc (0x555ad5237f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x555ad521e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x555ad522ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x555ad5224007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #53: _PyObject_Call_Prepend + 0x69 (0x555ad5235c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #54: + 0x211239 (0x555ad52f8239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #55: PyObject_Call + 0x207 (0x555ad5238067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x555ad521e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #57: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x555ad521c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #59: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #60: PyObject_Call + 0xbc (0x555ad5237f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x555ad521e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #62: + 0x150582 (0x555ad5237582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #63: PyObject_Call + 0xbc (0x555ad5237f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
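The C++ frames in these traces all point at the same mechanism: the blocking receive triggers lazy NCCL communicator creation (getNCCLComm, then broadcastUniqueNCCLID), which reads the peer's ncclUniqueId from the c10d key-value store (PrefixStore::get wrapping TCPStore::get under key '0:1'); because the peer process is already gone, the store read returns "Connection reset by peer". A minimal sketch of that rendezvous pattern is shown below; the host name, port, world size and timeout are illustrative assumptions and are not taken from this log.

# Editorial sketch of the c10d store rendezvous used by ProcessGroupNCCL
# (frames #2-#9 above); host, port, world_size and timeout are assumptions.
import datetime
from torch.distributed import TCPStore

store = TCPStore("master-node", 29500, world_size=2, is_master=False,
                 timeout=datetime.timedelta(seconds=30))
# The sending side of the rank pair publishes its NCCL unique id, roughly:
#     store.set("0:1", nccl_unique_id_bytes)
# The receiving side blocks on the corresponding get(); if the peer has
# already crashed, the read fails and is re-raised further up the stack as
# torch.distributed.DistBackendError, exactly as in the traces above.
unique_id_bytes = store.get("0:1")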
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank40]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank40]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7adb0e0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank40]: frame #1: + 0x5b3a23e (0x7f7b14bfd23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f7b14bf7c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f7b14bf7f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7b14bf8fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7b14bad371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7b14bad371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7b14bad371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7b14bad371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f7adc3ba189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f7adc3c1610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f7adc3e0978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank40]: frame #12: + 0x5adc309 (0x7f7b14b9f309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #13: + 0x5ae6f10 (0x7f7b14ba9f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #14: + 0x5ae6fa5 (0x7f7b14ba9fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #15: + 0x5124446 (0x7f7b141e7446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #16: + 0x1acf4b8 (0x7f7b10b924b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #17: + 0x5aee004 (0x7f7b14bb1004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #18: + 0x5af36b5 (0x7f7b14bb66b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #19: + 0xd2631e (0x7f7b277a031e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank40]: frame #20: + 0x47def4 (0x7f7b26ef7ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank40]: frame #21: + 0x1445a6 (0x562f27d0c5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #22: _PyObject_MakeTpCall + 0x26b (0x562f27d05a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #23: + 0x150866 (0x562f27d18866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x562f27d01142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #25: _PyFunction_Vectorcall + 0x6c (0x562f27d0ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #26: PyObject_Call + 0xbc (0x562f27d18f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank40]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x562f27cff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #28: _PyFunction_Vectorcall + 0x6c (0x562f27d0ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x562f27cfd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #30: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x562f27cfd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #32: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x562f27cfd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #34: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x562f27cfd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x562f27d04f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #37: _PyObject_Call_Prepend + 0x69 (0x562f27d16c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #38: + 0x211239 (0x562f27dd9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #39: _PyObject_MakeTpCall + 0x26b (0x562f27d05a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x562f27d013e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #41: _PyFunction_Vectorcall + 0x6c (0x562f27d0ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x562f27cfcc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #43: _PyFunction_Vectorcall + 0x6c (0x562f27d0ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x562f27cfd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #45: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #46: PyObject_Call + 0xbc (0x562f27d18f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x562f27cff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #48: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #49: PyObject_Call + 0xbc (0x562f27d18f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x562f27cff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x562f27d0ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x562f27d05007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #53: _PyObject_Call_Prepend + 0x69 (0x562f27d16c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #54: + 0x211239 (0x562f27dd9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #55: PyObject_Call + 0x207 (0x562f27d19067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x562f27cff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #57: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x562f27cfd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #59: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #60: PyObject_Call + 0xbc (0x562f27d18f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x562f27cff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #62: + 0x150582 (0x562f27d18582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #63: PyObject_Call + 0xbc (0x562f27d18f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
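On the Python side, every failing rank is a pipeline-parallel consumer blocked in the same place: the forward in nanotron/parallel/pipeline_parallel/block.py asks recv_from_pipeline_state_buffer for an activation, which bottoms out in p2p.py's _recv_meta and a blocking dist.recv from the previous stage. The sketch below mirrors that metadata-then-payload receive pattern; the helper name, metadata layout and dtype are assumptions for illustration, not nanotron's actual implementation.

# Editorial sketch of a metadata-then-payload point-to-point receive; helper
# name, metadata layout and dtype are assumptions, not nanotron's code.
import torch
import torch.distributed as dist

def recv_activation_from(prev_stage_rank: int, tag: int = 0) -> torch.Tensor:
    # First receive a small metadata tensor describing the payload
    # (e.g. number of dims and shape), then receive the payload itself.
    meta = torch.empty(8, dtype=torch.long, device="cuda")
    try:
        dist.recv(meta, src=prev_stage_rank, tag=tag)  # blocks; first use sets up NCCL
    except dist.DistBackendError as err:
        # If the upstream stage already crashed, NCCL communicator setup fails
        # here with the c10d store error seen in the traces above.
        raise RuntimeError(
            f"pipeline recv from rank {prev_stage_rank} failed during NCCL setup"
        ) from err
    ndim = int(meta[0])
    shape = [int(d) for d in meta[1 : 1 + ndim]]
    payload = torch.empty(shape, dtype=torch.bfloat16, device="cuda")
    dist.recv(payload, src=prev_stage_rank, tag=tag)
    return payload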
-[default0]:[rank32]: Traceback (most recent call last): identical to the rank 40 traceback above (same files and line numbers, from run_train.py line 237 through trainer.py, engine.py, llama.py, block.py, functional.py, state.py and p2p.py down to dist.recv and pg.recv([tensor], group_src_rank, tag).wait())
-[default0]:[rank32]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default0]:[rank32]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): frames #0 through #63 identical to rank 40 above, only the library load addresses differ. This may indicate a possible application crash on rank 0 or a network set up issue.
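The downstream ranks only report this secondary store error; the primary failure has to be found on the stage that died first, as the closing line of each trace suggests (a crash on the rank-0 side of the pair or a network issue). When triaging this kind of cascade it can help to make c10d and NCCL fail fast and log verbosely. The sketch below uses standard PyTorch/NCCL knobs; the timeout value is an illustrative assumption and none of these settings are taken from this benchmark's scripts.

# Editorial sketch: fail-fast / verbose settings for triaging NCCL setup failures.
# Assumes the usual launcher-provided env vars (MASTER_ADDR, RANK, WORLD_SIZE).
import datetime
import os
import torch.distributed as dist

os.environ.setdefault("NCCL_DEBUG", "INFO")                 # verbose NCCL logging
os.environ.setdefault("TORCH_DISTRIBUTED_DEBUG", "DETAIL")  # extra c10d diagnostics

dist.init_process_group(
    backend="nccl",
    timeout=datetime.timedelta(minutes=5),  # shorter than the default so hangs surface quickly
)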
-[default5]:[rank37]: Traceback (most recent call last): identical to the rank 40 traceback above (same files and line numbers, ending in p2p.py _recv_meta, dist.recv and pg.recv([tensor], group_src_rank, tag).wait())
-[default5]:[rank37]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default5]:[rank37]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): frames #0 through #63 identical to rank 40 above, only the library load addresses differ. This may indicate a possible application crash on rank 0 or a network set up issue.
-[default1]:[rank57]: Traceback (most recent call last): same call path as rank 40, output interleaved with the rank 37 traceback above; continues below
-[default2]:[rank58]: Traceback (most recent call last): same call path as rank 40, output interleaved; continues below
-[default6]:[rank62]: Traceback (most recent call last): same call path as rank 40, output interleaved; continues below
-[default6]:[rank62]: File
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: dist.recv( -[default1]:[rank57]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank62]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default6]:[rank62]: return func(*args, **kwargs) -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank57]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank62]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank58]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank58]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe395182897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6ea4bf7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank62]: frame #1: + 0x5b3a23e (0x7f6ede71423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #1: + 0x5b3a23e (0x7fe3cec9f23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f6ede70ec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f6ede70ef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f21093b3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank62]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f6ede70ffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #1: + 0x5b3a23e (0x7f2142ed023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fe3cec99c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f2142ecac87 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fe3cec99f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ede6c4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f2142ecaf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ede6c4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fe3cec9afd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f2142ecbfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe3cec4f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ede6c4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe3cec4f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe3cec4f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ede6c4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2142e80371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe3cec4f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f6ea5ed1189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2142e80371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fe39645c189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) 
-[default6]:[rank62]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f6ea5ed8610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fe396463610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2142e80371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f6ea5ef7978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2142e80371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #12: + 0x5adc309 (0x7f6ede6b6309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f210a68d189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fe396482978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #13: + 0x5ae6f10 (0x7f6ede6c0f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f210a694610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #12: + 0x5adc309 (0x7fe3cec41309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f210a6b3978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #13: + 0x5ae6f10 (0x7fe3cec4bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #12: + 0x5adc309 (0x7f2142e72309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #14: + 0x5ae6fa5 (0x7fe3cec4bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #13: + 0x5ae6f10 (0x7f2142e7cf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #14: + 0x5ae6fa5 (0x7f6ede6c0fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #15: + 
0x5124446 (0x7fe3ce289446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #14: + 0x5ae6fa5 (0x7f2142e7cfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #16: + 0x1acf4b8 (0x7fe3cac344b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #15: + 0x5124446 (0x7f6eddcfe446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #17: + 0x5aee004 (0x7fe3cec53004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #16: + 0x1acf4b8 (0x7f6eda6a94b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #15: + 0x5124446 (0x7f21424ba446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #18: + 0x5af36b5 (0x7fe3cec586b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #16: + 0x1acf4b8 (0x7f213ee654b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #17: + 0x5aee004 (0x7f6ede6c8004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #17: + 0x5aee004 (0x7f2142e84004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #19: + 0xd2631e (0x7fe3e184231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank62]: frame #18: + 0x5af36b5 (0x7f6ede6cd6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #18: + 0x5af36b5 (0x7f2142e896b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #20: + 0x47def4 (0x7fe3e0f99ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #19: + 0xd2631e (0x7f2155a7331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #21: + 0x1445a6 (0x556c454715a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #20: + 0x47def4 (0x7f21551caef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank58]: frame #22: _PyObject_MakeTpCall + 0x26b (0x556c4546aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #21: + 0x1445a6 (0x5594d6a9b5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #23: + 0x150866 (0x556c4547d866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x556c45466142 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #19: + 0xd2631e (0x7f6ef12b731e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5594d6a94a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #25: _PyFunction_Vectorcall + 0x6c (0x556c45471a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #20: + 0x47def4 (0x7f6ef0a0eef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #23: + 0x150866 (0x5594d6aa7866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #26: PyObject_Call + 0xbc (0x556c4547df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #21: + 0x1445a6 (0x565454a5e5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x556c454642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5594d6a90142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #22: _PyObject_MakeTpCall + 0x26b (0x565454a57a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5594d6a9ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #23: + 0x150866 (0x565454a6a866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #26: PyObject_Call + 0xbc (0x5594d6aa7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #28: _PyFunction_Vectorcall + 0x6c (0x556c45471a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x565454a53142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5594d6a8e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #25: _PyFunction_Vectorcall + 0x6c (0x565454a5ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x556c454628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5594d6a9ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #26: PyObject_Call + 0xbc (0x565454a6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5594d6a8c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x565454a512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #28: _PyFunction_Vectorcall + 0x6c (0x565454a5ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #30: + 0x150582 
(0x5594d6aa7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #30: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x565454a4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5594d6a8c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x556c454628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #32: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #32: + 0x150582 (0x5594d6aa7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x556c454628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #30: + 0x150582 (0x565454a6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #34: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x556c454628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x565454a4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #32: + 0x150582 (0x565454a6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5594d6a8c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x556c45469f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #37: _PyObject_Call_Prepend + 0x69 (0x556c4547bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x565454a4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #34: + 0x150582 (0x5594d6aa7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #38: + 0x211239 (0x556c4553e239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #34: + 0x150582 (0x565454a6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5594d6a8c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5594d6a93f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x565454a4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #39: _PyObject_MakeTpCall + 0x26b (0x556c4546aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5594d6aa5c39 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x565454a56f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x556c454663e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #38: + 0x211239 (0x5594d6b68239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #37: _PyObject_Call_Prepend + 0x69 (0x565454a68c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5594d6a94a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5594d6a903e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #38: + 0x211239 (0x565454b2b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #41: _PyFunction_Vectorcall + 0x6c (0x556c45471a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5594d6a9ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x556c45461c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #43: _PyFunction_Vectorcall + 0x6c (0x556c45471a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #39: _PyObject_MakeTpCall + 0x26b (0x565454a57a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x565454a533e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5594d6a8bc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x556c454628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #41: _PyFunction_Vectorcall + 0x6c (0x565454a5ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #45: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x565454a4ec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5594d6a9ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #46: PyObject_Call + 0xbc (0x556c4547df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #43: _PyFunction_Vectorcall + 0x6c (0x565454a5ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5594d6a8c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x556c454642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #45: + 0x150582 (0x5594d6aa7582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x565454a4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #45: + 0x150582 (0x565454a6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #46: PyObject_Call + 0xbc (0x5594d6aa7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #48: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #49: PyObject_Call + 0xbc (0x556c4547df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #46: PyObject_Call + 0xbc (0x565454a6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5594d6a8e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x556c454642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x565454a512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #48: + 0x150582 (0x565454a6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #51: _PyFunction_Vectorcall + 0x6c (0x556c45471a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #48: + 0x150582 (0x5594d6aa7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x556c4546a007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #49: PyObject_Call + 0xbc (0x5594d6aa7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5594d6a8e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #49: PyObject_Call + 0xbc (0x565454a6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5594d6a9ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #53: _PyObject_Call_Prepend + 0x69 (0x556c4547bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x565454a512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5594d6a94007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5594d6aa5c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #51: _PyFunction_Vectorcall + 0x6c (0x565454a5ea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #54: + 0x211239 (0x556c4553e239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x565454a57007 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #55: PyObject_Call + 0x207 (0x556c4547e067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #53: _PyObject_Call_Prepend + 0x69 (0x565454a68c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #54: + 0x211239 (0x565454b2b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x556c454642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #54: + 0x211239 (0x5594d6b68239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #55: PyObject_Call + 0x207 (0x565454a6b067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #57: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x565454a512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #55: PyObject_Call + 0x207 (0x5594d6aa8067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x556c454628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5594d6a8e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #57: + 0x150582 (0x565454a6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #59: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #60: PyObject_Call + 0xbc (0x556c4547df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x565454a4f8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #57: + 0x150582 (0x5594d6aa7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x556c454642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #59: + 0x150582 (0x565454a6a582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5594d6a8c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #60: PyObject_Call + 0xbc (0x565454a6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #62: + 0x150582 (0x556c4547d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #63: PyObject_Call + 0xbc (0x556c4547df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x565454a512b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #59: + 0x150582 (0x5594d6aa7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #62: + 0x150582 (0x565454a6a582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #60: PyObject_Call + 0xbc (0x5594d6aa7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5594d6a8e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #63: PyObject_Call + 0xbc (0x565454a6af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default6]:[rank62]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank57]: frame #62: + 0x150582 (0x5594d6aa7582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #63: PyObject_Call + 0xbc (0x5594d6aa7f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default7]:[rank63]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default4]:[rank60]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: Traceback (most 
recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) 
-[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = 
self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank41]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank41]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank41]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f86b4f84897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank41]: frame #1: + 0x5b3a23e (0x7f86eeaa123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f86eea9bc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f86eea9bf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default1]:[rank41]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f86eea9cfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f86eea51371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f86eea51371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f86eea51371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f86eea51371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f86b625e189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f86b6265610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f86b6284978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #12: + 0x5adc309 (0x7f86eea43309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #13: + 0x5ae6f10 (0x7f86eea4df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: frame #14: + 0x5ae6fa5 (0x7f86eea4dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #15: + 0x5124446 (0x7f86ee08b446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #16: + 0x1acf4b8 (0x7f86eaa364b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: frame #17: + 0x5aee004 (0x7f86eea55004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #18: + 0x5af36b5 (0x7f86eea5a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #19: + 0xd2631e (0x7f870164431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: frame #20: + 0x47def4 
(0x7f8700d9bef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank41]: frame #21: + 0x1445a6 (0x5647ef7f65a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5647ef7efa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default1]:[rank41]: frame #23: + 0x150866 (0x5647ef802866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5647ef7eb142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5647ef7f6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: frame #26: PyObject_Call + 0xbc (0x5647ef802f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5647ef7e92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5647ef7f6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5647ef7e78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #30: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5647ef7e78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #32: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cl[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -uster/bin/python3.10) -[default1]:[rank41]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5647ef7e78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #34: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5647ef7e78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5647ef7eef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5647ef800c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #38: + 0x211239 (0x5647ef8c3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5647ef7efa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: 
pipeline_state.run_communication() -[default1]:[rank41]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5647ef7eb3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5647ef7f6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5647ef7e6c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5647ef7f6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5647ef7e78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #45: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #46: PyObject_Call + 0xbc (0x5647ef802f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5647ef7e92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #48: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #49: PyObject_Call + 0xbc (0x5647ef802f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5647ef7e92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5647ef7f6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5647ef7ef007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5647ef800c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: frame #54: + 0x211239 (0x5647ef8c3239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #55: PyObject_Call + 0x207 (0x5647ef803067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5647ef7e92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: frame #57: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5647ef7e78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #59: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #60:
PyObject_Call + 0xbc (0x5647ef802f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5647ef7e92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: frame #62: + 0x150582 (0x5647ef802582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #63: PyObject_Call + 0xbc (0x5647ef802f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank63]: dist.recv( -[default4]:[rank60]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank63]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank60]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank63]: Exception raised 
from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank60]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d8d0b8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank63]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efee25bf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank60]: frame #1: + 0x5b3a23e (0x7f2dc6bd523e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #1: + 0x5b3a23e (0x7eff1c0dc23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f2dc6bcfc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7eff1c0d6c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f2dc6bcff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7eff1c0d6f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f2dc6bd0fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7eff1c0d7fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2dc6b85371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7eff1c08c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2dc6b85371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2dc6b85371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7eff1c08c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2dc6b85371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2d8e392189 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f2d8e399610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2d8e3b8978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7eff1c08c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #12: + 0x5adc309 (0x7f2dc6b77309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #13: + 0x5ae6f10 (0x7f2dc6b81f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7eff1c08c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7efee3899189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7efee38a0610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #14: + 0x5ae6fa5 (0x7f2dc6b81fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7efee38bf978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]: frame #12: + 0x5adc309 (0x7eff1c07e309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #15: + 0x5124446 (0x7f2dc61bf446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #16: + 0x1acf4b8 (0x7f2dc2b6a4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #17: + 0x5aee004 (0x7f2dc6b89004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #18: + 0x5af36b5 (0x7f2dc6b8e6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #13: + 0x5ae6f10 (0x7eff1c088f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #19: + 0xd2631e (0x7f2dd977831e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank63]: frame #14: + 0x5ae6fa5 (0x7eff1c088fa5 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #20: + 0x47def4 (0x7f2dd8ecfef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: frame #21: + 0x1445a6 (0x5562658275a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #22: _PyObject_MakeTpCall + 0x26b (0x556265820a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #15: + 0x5124446 (0x7eff1b6c6446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #23: + 0x150866 (0x556265833866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #16: + 0x1acf4b8 (0x7eff180714b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank63]: frame #17: + 0x5aee004 (0x7eff1c090004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55626581c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #18: + 0x5af36b5 (0x7eff1c0956b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #25: _PyFunction_Vectorcall + 0x6c (0x556265827a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #26: PyObject_Call + 0xbc (0x556265833f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #19: + 0xd2631e (0x7eff2ec7f31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank63]: frame #20: + 0x47def4 (0x7eff2e3d6ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank63]: frame #21: + 0x1445a6 (0x5626c0bec5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55626581a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5626c0be5a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #23: + 0x150866 (0x5626c0bf8866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5626c0be1142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5626c0beca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #26: PyObject_Call + 0xbc (0x5626c0bf8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #28: _PyFunction_Vectorcall + 0x6c (0x556265827a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5626c0bdf2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #28: _PyFunction_Vectorcall + 0x6c 
(0x5626c0beca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5626c0bdd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #30: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5562658188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5626c0bdd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #30: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5562658188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #32: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5562658188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #34: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5562658188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #32: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55626581ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5626c0bdd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #37: _PyObject_Call_Prepend + 0x69 (0x556265831c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #34: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #38: + 0x211239 (0x5562658f4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5626c0bdd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #39: _PyObject_MakeTpCall + 0x26b (0x556265820a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55626581c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #41: _PyFunction_Vectorcall + 0x6c (0x556265827a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x556265817c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5626c0be4f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #43: _PyFunction_Vectorcall + 0x6c (0x556265827a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5626c0bf6c39 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5562658188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #38: + 0x211239 (0x5626c0cb9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #45: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5626c0be5a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #46: PyObject_Call + 0xbc (0x556265833f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5626c0be13e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55626581a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5626c0beca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #48: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #49: PyObject_Call + 0xbc (0x556265833f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55626581a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #51: _PyFunction_Vectorcall + 0x6c (0x556265827a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x556265820007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #53: _PyObject_Call_Prepend + 0x69 (0x556265831c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #54: + 0x211239 (0x5562658f4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5626c0bdcc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #55: PyObject_Call + 0x207 (0x556265834067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55626581a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5626c0beca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #57: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5626c0bdd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5562658188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #45: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #59: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank63]: frame #46: PyObject_Call + 0xbc (0x5626c0bf8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #60: PyObject_Call + 0xbc (0x556265833f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55626581a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5626c0bdf2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #62: + 0x150582 (0x556265833582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #48: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x556265833f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #49: PyObject_Call + 0xbc (0x5626c0bf8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank63]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5626c0bdf2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5626c0beca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5626c0be5007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5626c0bf6c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #54: + 0x211239 (0x5626c0cb9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #55: PyObject_Call + 0x207 (0x5626c0bf9067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5626c0bdf2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #57: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5626c0bdd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #59: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #60: PyObject_Call + 0xbc (0x5626c0bf8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5626c0bdf2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #62: + 0x150582 (0x5626c0bf8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: frame #63: PyObject_Call + 0xbc (0x5626c0bf8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank45]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank45]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff249b85897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank45]: frame #1: + 0x5b3a23e (0x7ff2836a223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7ff28369cc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7ff28369cf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7ff28369dfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff283652371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff283652371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff283652371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff283652371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7ff24ae5f189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7ff24ae66610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7ff24ae85978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #12: + 0x5adc309 (0x7ff283644309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #13: + 0x5ae6f10 (0x7ff28364ef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #14: + 0x5ae6fa5 (0x7ff28364efa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #15: + 0x5124446 (0x7ff282c8c446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #16: + 0x1acf4b8 (0x7ff27f6374b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #17: + 0x5aee004 (0x7ff283656004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #18: + 0x5af36b5 (0x7ff28365b6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #19: + 0xd2631e (0x7ff29624531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #20: + 0x47def4 (0x7ff29599cef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #21: + 0x1445a6 (0x55826f2205a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55826f219a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #23: + 0x150866 (0x55826f22c866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55826f215142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55826f220a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #26: PyObject_Call + 0xbc (0x55826f22cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default5]:[rank45]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55826f2132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55826f220a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55826f2118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #30: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55826f2118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #32: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55826f2118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #34: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55826f2118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55826f218f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55826f22ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #38: + 0x211239 (0x55826f2ed239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55826f219a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55826f2153e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55826f220a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55826f210c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55826f220a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55826f2118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #45: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #46: PyObject_Call + 0xbc (0x55826f22cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55826f2132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #48: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #49: PyObject_Call + 0xbc (0x55826f22cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55826f2132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x55826f220a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55826f219007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55826f22ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #54: + 0x211239 (0x55826f2ed239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #55: PyObject_Call + 0x207 (0x55826f22d067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55826f2132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #57: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55826f2118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #59: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #60: PyObject_Call + 0xbc (0x55826f22cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55826f2132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #62: + 0x150582 (0x55826f22c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #63: PyObject_Call + 0xbc (0x55826f22cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: output = model(**micro_batch) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default7]:[rank47]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank47]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank47]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac2f4d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: frame #1: + 0x5b3a23e (0x7fac68ff623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fac68ff0c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fac68ff0f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fac68ff1fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fac68fa6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fac68fa6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fac68fa6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fac68fa6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fac307b3189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fac307ba610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: pipeline_state.run_communication() -[default7]:[rank47]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fac307d9978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: frame #12: + 0x5adc309 (0x7fac68f98309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: frame #13: + 0x5ae6f10 (0x7fac68fa2f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: frame #14: + 0x5ae6fa5 (0x7fac68fa2fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: frame #15: + 0x5124446 (0x7fac685e0446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #16: + 0x1acf4b8 (0x7fac64f8b4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default7]:[rank47]: frame #17: + 0x5aee004 (0x7fac68faa004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #18: + 0x5af36b5 (0x7fac68faf6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: frame #19: + 0xd2631e (0x7fac7bb9931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank43]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank43]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9a76439897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: frame #20: + 0x47def4 (0x7fac7b2f0ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #21: + 0x1445a6 (0x55d1d381b5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #1: + 0x5b3a23e (0x7f9aaff5623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f9aaff50c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55d1d3814a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #23: + 0x150866 (0x55d1d3827866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55d1d3810142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55d1d381ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #26: PyObject_Call + 0xbc (0x55d1d3827f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1d380e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55d1d381ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f9aaff50f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f9aaff51fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default7]:[rank47]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55d1d380c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9aaff06371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #30: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55d1d380c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #32: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9aaff06371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9aaff06371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f9aaff06371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f9a77713189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f9a7771a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55d1d380c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f9a77739978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #12: + 0x5adc309 (0x7f9aafef8309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #13: + 0x5ae6f10 (0x7f9aaff02f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #34: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55d1d380c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55d1d3813f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #14: + 0x5ae6fa5 (0x7f9aaff02fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #15: + 0x5124446 (0x7f9aaf540446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55d1d3825c39 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #16: + 0x1acf4b8 (0x7f9aabeeb4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #38: + 0x211239 (0x55d1d38e8239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55d1d3814a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #17: + 0x5aee004 (0x7f9aaff0a004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #18: + 0x5af36b5 (0x7f9aaff0f6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #19: + 0xd2631e (0x7f9ac2af931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55d1d38103e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55d1d381ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55d1d380bc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55d1d381ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #20: + 0x47def4 (0x7f9ac2250ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #21: + 0x1445a6 (0x55bf0bc295a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55bf0bc22a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #23: + 0x150866 (0x55bf0bc35866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55bf0bc1e142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55bf0bc29a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #26: PyObject_Call + 0xbc (0x55bf0bc35f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf0bc1c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55d1d380c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #45: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55bf0bc29a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55bf0bc1a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #30: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default3]:[rank43]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55bf0bc1a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #32: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55bf0bc1a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #34: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55bf0bc1a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55bf0bc21f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55bf0bc33c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #38: + 0x211239 (0x55bf0bcf6239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55bf0bc22a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55bf0bc1e3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55bf0bc29a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55bf0bc19c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55bf0bc29a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55bf0bc1a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #45: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #46: PyObject_Call + 0xbc (0x55d1d3827f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #46: PyObject_Call + 0xbc (0x55bf0bc35f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf0bc1c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #48: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #49: PyObject_Call + 0xbc (0x55bf0bc35f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf0bc1c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1d380e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #48: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #49: PyObject_Call + 0xbc (0x55d1d3827f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #51: _PyFunction_Vectorcall + 0x6c 
(0x55bf0bc29a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55bf0bc22007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1d380e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55d1d381ba2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55d1d3814007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55bf0bc33c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #54: + 0x211239 (0x55bf0bcf6239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55d1d3825c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #55: PyObject_Call + 0x207 (0x55bf0bc36067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf0bc1c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #57: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55bf0bc1a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #54: + 0x211239 (0x55d1d38e8239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #59: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #60: PyObject_Call + 0xbc (0x55bf0bc35f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55bf0bc1c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #62: + 0x150582 (0x55bf0bc35582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #63: PyObject_Call + 0xbc (0x55bf0bc35f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #55: PyObject_Call + 0x207 (0x55d1d3828067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1d380e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #57: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55d1d380c8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank47]: frame #59: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #60: PyObject_Call + 0xbc (0x55d1d3827f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1d380e2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #62: + 0x150582 (0x55d1d3827582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #63: PyObject_Call + 0xbc (0x55d1d3827f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank59]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank59]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a29a50897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]: frame #1: + 0x5b3a23e (0x7f3a6356d23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f3a63567c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #3: c10d::TCPStore::doGet(std::string 
const&) + 0x32 (0x7f3a63567f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f3a63568fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3a6351d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3a6351d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3a6351d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3a6351d371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f3a2ad2a189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f3a2ad31610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f3a2ad50978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: frame #12: + 0x5adc309 (0x7f3a6350f309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #13: + 0x5ae6f10 (0x7f3a63519f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #14: + 0x5ae6fa5 (0x7f3a63519fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #15: + 0x5124446 (0x7f3a62b57446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #16: + 0x1acf4b8 (0x7f3a5f5024b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #17: + 0x5aee004 (0x7f3a63521004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #18: + 0x5af36b5 (0x7f3a635266b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #19: + 0xd2631e (0x7f3a7611031e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: frame #20: + 0x47def4 (0x7f3a75867ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: frame #21: + 0x1445a6 (0x5587bbe275a6 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5587bbe20a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #23: + 0x150866 (0x5587bbe33866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5587bbe1c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5587bbe27a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #26: PyObject_Call + 0xbc (0x5587bbe33f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5587bbe1a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5587bbe27a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5587bbe188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #30: + 0x150582 (0x5587bbe33582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5587bbe188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #32: + 0x150582 (0x5587bbe33582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5587bbe188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #34: + 0x150582 (0x5587bbe33582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5587bbe188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5587bbe1ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5587bbe31c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #38: + 0x211239 (0x5587bbef4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5587bbe20a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5587bbe1c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5587bbe27a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5587bbe17c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5587bbe27a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5587bbe188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #45: + 0x150582 (0x5587bbe33582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #46: PyObject_Call + 0xbc (0x5587bbe33f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5587bbe1a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #48: + 0x150582 (0x5587bbe33582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #49: PyObject_Call + 0xbc (0x5587bbe33f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5587bbe1a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5587bbe27a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5587bbe20007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5587bbe31c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #54: + 0x211239 (0x5587bbef4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #55: PyObject_Call + 0x207 (0x5587bbe34067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5587bbe1a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #57: + 0x150582 (0x5587bbe33582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5587bbe188fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #59: + 0x150582 (0x5587bbe33582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #60: PyObject_Call + 0xbc (0x5587bbe33f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5587bbe1a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #62: + 0x150582 (0x5587bbe33582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #63: PyObject_Call + 0xbc (0x5587bbe33f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
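Every rank above dies in the same place: nanotron's P2P._recv_meta -> dist.recv path. The first point-to-point receive between a rank pair lazily creates the NCCL communicator, and the ncclUniqueId for it is fetched from the c10d TCPStore hosted at MASTER_ADDR, so once rank 0 (which the error message itself points at) has gone down, store->get('0:1') fails with "Connection reset by peer" instead of a plain NCCL error. Below is a minimal, self-contained sketch of that call pattern, not nanotron's actual implementation: the recv_meta helper name and the 8-element int64 buffer are placeholders, and only the lazy-communicator behaviour is the point. Launched with torchrun on two GPUs, killing the sender before the receive completes would typically surface the same DistBackendError/TCPStore trace seen in this log.

# Hedged sketch (assumed names, not the nanotron API): a metadata receive in the
# spirit of the _recv_meta -> dist.recv frames above.
import os
import torch
import torch.distributed as dist

def recv_meta(from_rank: int, device: torch.device) -> torch.Tensor:
    # Placeholder metadata buffer; the real code derives its own shape/dtype layout.
    meta = torch.empty(8, dtype=torch.int64, device=device)
    # The first recv between this rank pair triggers NCCL communicator setup,
    # which exchanges ncclUniqueId through the c10d TCPStore on MASTER_ADDR.
    dist.recv(meta, src=from_rank)
    return meta

if __name__ == "__main__":
    # torchrun provides RANK/WORLD_SIZE/LOCAL_RANK/MASTER_ADDR/MASTER_PORT.
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    rank = dist.get_rank()
    if rank == 0:
        payload = torch.arange(8, dtype=torch.int64, device="cuda")
        dist.send(payload, dst=1)
    elif rank == 1:
        print(recv_meta(from_rank=0, device=torch.device("cuda")))
    dist.barrier()
    dist.destroy_process_group()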
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f414dfbd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: frame #1: + 0x5b3a23e (0x7f4187ada23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f4187ad4c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f4187ad4f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 
(0x7f4187ad5fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4187a8a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4187a8a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4187a8a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4187a8a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f414f297189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f414f29e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f414f2bd978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #12: + 0x5adc309 (0x7f4187a7c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #13: + 0x5ae6f10 (0x7f4187a86f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank42]: frame #14: + 0x5ae6fa5 (0x7f4187a86fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: frame #15: + 0x5124446 (0x7f41870c4446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: return func(*args, **kwargs) -[default2]:[rank42]: frame #16: + 0x1acf4b8 (0x7f4183a6f4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #17: + 0x5aee004 (0x7f4187a8e004 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #18: + 0x5af36b5 (0x7f4187a936b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: frame #19: + 0xd2631e (0x7f419a67d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #20: + 0x47def4 (0x7f4199dd4ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #21: + 0x1445a6 (0x5609653455a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56096533ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank44]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank44]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5d81c0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank44]: frame #1: + 0x5b3a23e (0x7f5dbb72723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5dbb721c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5dbb721f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5dbb722fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #23: + 0x150866 (0x560965351866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56096533a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #25: _PyFunction_Vectorcall + 0x6c (0x560965345a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #26: PyObject_Call + 0xbc (0x560965351f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5609653382b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #28: _PyFunction_Vectorcall + 0x6c (0x560965345a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5609653368fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5dbb6d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #30: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5609653368fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #32: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5dbb6d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5dbb6d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5609653368fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #34: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5609653368fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56096533df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56096534fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5dbb6d7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f5d82ee4189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f5d82eeb610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f5d82f0a978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #38: + 0x211239 (0x560965412239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56096533ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56096533a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #12: + 0x5adc309 (0x7f5dbb6c9309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #13: + 0x5ae6f10 (0x7f5dbb6d3f10 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #41: _PyFunction_Vectorcall + 0x6c (0x560965345a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x560965335c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #43: _PyFunction_Vectorcall + 0x6c (0x560965345a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5609653368fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #45: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #46: PyObject_Call + 0xbc (0x560965351f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5609653382b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #48: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #49: PyObject_Call + 0xbc (0x560965351f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #14: + 0x5ae6fa5 (0x7f5dbb6d3fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5609653382b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #51: _PyFunction_Vectorcall + 0x6c (0x560965345a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #15: + 0x5124446 (0x7f5dbad11446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56096533e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #16: + 0x1acf4b8 (0x7f5db76bc4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #17: + 0x5aee004 (0x7f5dbb6db004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56096534fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #54: + 0x211239 (0x560965412239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #55: PyObject_Call + 0x207 (0x560965352067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5609653382b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #18: + 0x5af36b5 (0x7f5dbb6e06b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #57: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5609653368fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #59: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #60: PyObject_Call + 0xbc (0x560965351f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5609653382b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #62: + 0x150582 (0x560965351582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #63: PyObject_Call + 0xbc (0x560965351f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank44]: frame #19: + 0xd2631e (0x7f5dce2ca31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #20: + 0x47def4 (0x7f5dcda21ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #21: + 0x1445a6 (0x55b5963465a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b59633fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #23: + 0x150866 (0x55b596352866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b59633b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b596346a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #26: PyObject_Call + 0xbc (0x55b596352f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5963392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55b596346a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b5963378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #30: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b5963378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #32: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b5963378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #34: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b5963378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b59633ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default4]:[rank44]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b596350c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #38: + 0x211239 (0x55b596413239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b59633fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b59633b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b596346a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b596336c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b596346a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b5963378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #45: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #46: PyObject_Call + 0xbc (0x55b596352f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5963392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #48: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #49: PyObject_Call + 0xbc (0x55b596352f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5963392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b596346a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b59633f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b596350c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #54: + 0x211239 (0x55b596413239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #55: PyObject_Call + 0x207 (0x55b596353067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5963392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #57: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b5963378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #59: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #60: PyObject_Call + 0xbc (0x55b596352f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 
(0x55b5963392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #62: + 0x150582 (0x55b596352582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #63: PyObject_Call + 0xbc (0x55b596352f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module> -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in 
_call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank52]: pipeline_state.run_communication() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank52]: dist.recv( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: return func(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank52]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank52]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank52]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2eed2f1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank52]: frame #1: + 0x5b3a23e (0x7f2f26e0e23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f2f26e08c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f2f26e08f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f2f26e09fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default4]:[rank52]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2f26dbe371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2f26dbe371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2f26dbe371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f2f26dbe371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2eee5cb189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f2eee5d2610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2eee5f1978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #12: + 0x5adc309 (0x7f2f26db0309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #13: + 0x5ae6f10 (0x7f2f26dbaf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #14: + 0x5ae6fa5 (0x7f2f26dbafa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #15: + 0x5124446 (0x7f2f263f8446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #16: + 0x1acf4b8 (0x7f2f22da34b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #17: + 0x5aee004 (0x7f2f26dc2004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #18: + 0x5af36b5 (0x7f2f26dc76b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #19: + 0xd2631e (0x7f2f399b131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: frame #20: + 0x47def4 (0x7f2f39108ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: frame #21: + 0x1445a6 (0x557875f2d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #22: _PyObject_MakeTpCall + 0x26b (0x557875f26a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #23: + 0x150866 (0x557875f39866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default4]:[rank52]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x557875f22142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #25: _PyFunction_Vectorcall + 0x6c (0x557875f2da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #26: PyObject_Call + 0xbc (0x557875f39f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x557875f202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #28: _PyFunction_Vectorcall + 0x6c (0x557875f2da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x557875f1e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #30: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x557875f1e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #32: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x557875f1e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #34: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x557875f1e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x557875f25f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #37: _PyObject_Call_Prepend + 0x69 (0x557875f37c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #38: + 0x211239 (0x557875ffa239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #39: _PyObject_MakeTpCall + 0x26b (0x557875f26a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x557875f223e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557875f2da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557875f1dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557875f2da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x557875f1e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #45: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #46: PyObject_Call + 0xbc (0x557875f39f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x557875f202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: 
frame #48: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #49: PyObject_Call + 0xbc (0x557875f39f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x557875f202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557875f2da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557875f26007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557875f37c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #54: + 0x211239 (0x557875ffa239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #55: PyObject_Call + 0x207 (0x557875f3a067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x557875f202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #57: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x557875f1e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #59: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #60: PyObject_Call + 0xbc (0x557875f39f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x557875f202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #62: + 0x150582 (0x557875f39582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #63: PyObject_Call + 0xbc (0x557875f39f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module> -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank55]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank55]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feff5803897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: frame #1: + 0x5b3a23e (0x7ff02f32023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7ff02f31ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7ff02f31af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7ff02f31bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff02f2d0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff02f2d0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff02f2d0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff02f2d0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7feff6add189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7feff6ae4610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7feff6b03978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #12: + 0x5adc309 (0x7ff02f2c2309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #13: + 0x5ae6f10 (0x7ff02f2ccf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #14: + 0x5ae6fa5 (0x7ff02f2ccfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #15: + 0x5124446 (0x7ff02e90a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #16: + 0x1acf4b8 (0x7ff02b2b54b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #17: + 0x5aee004 (0x7ff02f2d4004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #18: + 0x5af36b5 (0x7ff02f2d96b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #19: + 0xd2631e (0x7ff041ec331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank55]: frame #20: + 0x47def4 (0x7ff04161aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank55]: frame #21: + 0x1445a6 (0x561cdfbc25a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #22: _PyObject_MakeTpCall + 0x26b (0x561cdfbbba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #23: + 0x150866 (0x561cdfbce866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x561cdfbb7142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #25: _PyFunction_Vectorcall + 0x6c (0x561cdfbc2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #26: PyObject_Call + 0xbc (0x561cdfbcef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank55]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x561cdfbb52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #28: _PyFunction_Vectorcall + 0x6c (0x561cdfbc2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x561cdfbb38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #30: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x561cdfbb38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #32: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x561cdfbb38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #34: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x561cdfbb38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x561cdfbbaf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #37: _PyObject_Call_Prepend + 0x69 (0x561cdfbccc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #38: + 0x211239 (0x561cdfc8f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #39: _PyObject_MakeTpCall + 0x26b (0x561cdfbbba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x561cdfbb73e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #41: _PyFunction_Vectorcall + 0x6c (0x561cdfbc2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x561cdfbb2c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #43: _PyFunction_Vectorcall + 0x6c (0x561cdfbc2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x561cdfbb38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #45: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #46: PyObject_Call + 0xbc (0x561cdfbcef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x561cdfbb52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #48: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #49: PyObject_Call + 0xbc (0x561cdfbcef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x561cdfbb52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x561cdfbc2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x561cdfbbb007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #53: _PyObject_Call_Prepend + 0x69 (0x561cdfbccc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #54: + 0x211239 (0x561cdfc8f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #55: PyObject_Call + 0x207 (0x561cdfbcf067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x561cdfbb52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #57: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x561cdfbb38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #59: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #60: PyObject_Call + 0xbc (0x561cdfbcef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x561cdfbb52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #62: + 0x150582 (0x561cdfbce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #63: PyObject_Call + 0xbc (0x561cdfbcef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module> -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module> -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module> -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: trainer.train(dataloader) -[default6]:[rank38]: sharded_logits = self.model( -[default3]:[rank35]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step 
-[default6]:[rank38]: return forward_call(*args, **kwargs) -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: 
return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: sharded_logits = self.model( -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: 
recv_activation_tensor = recv_activation()
-[default6]:[rank54]: Traceback (most recent call last):
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in 
-[default6]:[rank54]: trainer.train(dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank54]: output = model(**micro_batch)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank54]: sharded_logits = self.model(
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank54]: pipeline_state.run_communication()
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank54]: recv_activation_tensor = recv_activation()
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default6]:[rank54]: dist.recv(
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank54]: return func(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank54]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank54]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank54]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8abf3f0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:[rank54]: frame #1: + 0x5b3a23e (0x7f8af8f0d23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f8af8f07c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f8af8f07f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f8af8f08fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8af8ebd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8af8ebd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8af8ebd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8af8ebd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank54]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f8ac06ca189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank54]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f8ac06d1610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank54]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f8ac06f0978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank54]: frame #12 through frame #63: libtorch_cpu.so / libtorch_python.so / python3.10 interpreter call frames
-[default6]:[rank54]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default6]:[rank38]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank38]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank38]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default4]:[rank36]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default4]:[rank36]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default4]:[rank36]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default3]:[rank35]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank35]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default3]:[rank35]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default2]:[rank50]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default3]:[rank51]: Traceback (most recent call last):
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank49]: trainer.train(dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank49]: output = model(**micro_batch)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank49]: sharded_logits = self.model(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank49]: pipeline_state.run_communication()
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank49]: recv_activation_tensor = recv_activation()
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank49]: dist.recv(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank49]: return func(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank49]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank49]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank49]: frames #0-#20: same c10::Error / c10d::TCPStore / c10d::PrefixStore / c10d::ProcessGroupNCCL chain as rank 50 above
-[default1]:[rank49]: frames #21-#63: CPython interpreter frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default3]:[rank51]: [Python traceback identical to rank 49: run_train.py:237 -> trainer.py:429/462 -> pipeline_parallel/engine.py:278/44 -> llama.py:891/764/780 -> pipeline_parallel/block.py:126 -> functional.py:117 -> state.py:150/31 -> p2p.py:353/326/246 -> dist.recv]
-[default3]:[rank51]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank51]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default3]:[rank51]: frames #0-#20: same c10d::TCPStore / PrefixStore / ProcessGroupNCCL chain as rank 50
-[default3]:[rank51]: frames #21-#63: CPython interpreter frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default3]:[rank51]: . This may indicate a possible application crash on rank 0 or a network set up issue.
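Editor's note: every hanging rank above fails at the same point, the first pipeline-parallel dist.recv of activation metadata from the previous stage, which lazily creates a NCCL communicator for that rank pair and fetches its ncclUniqueId from the c10d store under the key '0:1'. The sketch below only illustrates that pattern under torchrun; it is not the nanotron code, and the file name, tensor size and 2-process layout are made up.

# minimal_p2p_recv.py (hypothetical file name), launch with:
#   torchrun --nproc_per_node 2 minimal_p2p_recv.py
import torch
import torch.distributed as dist

def main() -> None:
    # torchrun provides MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE for env:// init
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank % torch.cuda.device_count())
    meta = torch.zeros(8, dtype=torch.int64, device="cuda")
    if rank == 0:
        meta += 42                  # stand-in for the shape/dtype metadata tensor
        dist.send(meta, dst=1)      # previous stage pushes activation metadata
    else:
        dist.recv(meta, src=0)      # next stage blocks here, like _recv_meta above;
                                    # the first send/recv on this pair triggers the
                                    # ncclUniqueId exchange through the c10d store
    dist.destroy_process_group()

if __name__ == "__main__":
    main()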
-[default5]:[rank53]: Traceback (most recent call last):
-[default5]:[rank53]: [Python traceback identical to rank 49 above, ending in dist.recv -> pg.recv([tensor], group_src_rank, tag).wait()]
-[default5]:[rank53]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default5]:[rank53]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default5]:[rank53]: frames #0-#20: same c10d::TCPStore / PrefixStore / ProcessGroupNCCL chain as rank 50
-[default5]:[rank53]: frames #21-#63: CPython interpreter frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default5]:[rank53]: . This may indicate a possible application crash on rank 0 or a network set up issue.
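Editor's note: the store->get('0:1') failure ("Connection reset by peer") means the TCPStore served by the rendezvous / rank-0 process dropped the connection, usually because that process had already crashed or the port became unreachable, which matches the closing hint each rank prints. A hedged, stand-alone connectivity check is sketched below; it is not part of the benchmark scripts, and it assumes you run it on a spare port rather than the one the training job is using.

# store_check.py (hypothetical), run with torchrun so RANK/WORLD_SIZE are set,
# with MASTER_ADDR pointing at the rank-0 host and MASTER_PORT set to a free port.
import os
from datetime import timedelta

from torch.distributed import TCPStore

host = os.environ["MASTER_ADDR"]
port = int(os.environ["MASTER_PORT"])
rank = int(os.environ.get("RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))

# Rank 0 hosts the store; every other rank connects as a client.
store = TCPStore(
    host,
    port,
    world_size=world_size,
    is_master=(rank == 0),
    timeout=timedelta(seconds=60),
    wait_for_workers=False,
)
store.set(f"ping_{rank}", b"ok")
# Blocks until rank 0's key exists; raises if the store host is unreachable.
print(f"rank {rank}: store reachable, ping_0 = {store.get('ping_0')}")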
-[default0]:[rank48]: Traceback (most recent call last):
-[default0]:[rank48]: [Python traceback identical to rank 49 above: run_train.py:237 -> trainer.py:429/462 -> pipeline_parallel/engine.py:278/44 -> llama.py:891/764/780 -> block.py:126 -> functional.py:117 -> state.py:150/31 -> p2p.py:353/326/246 -> dist.recv -> pg.recv([tensor], group_src_rank, tag).wait()]
-[default0]:[rank48]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default0]:[rank48]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default0]:[rank48]: frames #0-#20: same c10::Error / c10d::TCPStore / PrefixStore / ProcessGroupNCCL chain as rank 50
-[default0]:[rank48]: frames #21-#37: CPython interpreter frames in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[default7]:[rank39]: Traceback (most recent call last):
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank39]: trainer.train(dataloader)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank39]: output = model(**micro_batch)
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank39]: return forward_call(*args, **kwargs)
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank39]: sharded_logits = self.model(
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank34]: Traceback (most recent call last):
-[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank48]: frame #38: + 0x211239
(0x559bf77c5239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #39: _PyObject_MakeTpCall + 0x26b (0x559bf76f1a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x559bf76ed3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #41: _PyFunction_Vectorcall + 0x6c (0x559bf76f8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x559bf76e8c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #43: _PyFunction_Vectorcall + 0x6c (0x559bf76f8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x559bf76e98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #45: + 0x150582 (0x559bf7704582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #46: PyObject_Call + 0xbc (0x559bf7704f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: trainer.train(dataloader) -[default0]:[rank48]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x559bf76eb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #48: + 0x150582 (0x559bf7704582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #49: PyObject_Call + 0xbc (0x559bf7704f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x559bf76eb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #51: _PyFunction_Vectorcall + 0x6c (0x559bf76f8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x559bf76f1007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #53: _PyObject_Call_Prepend + 0x69 (0x559bf7702c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: frame #54: + 0x211239 (0x559bf77c5239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #55: PyObject_Call + 0x207 (0x559bf7705067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x559bf76eb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #57: + 0x150582 (0x559bf7704582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x559bf76e98fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #59: + 0x150582 (0x559bf7704582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #60: PyObject_Call + 0xbc (0x559bf7704f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x559bf76eb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: frame #62: + 0x150582 (0x559bf7704582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #63: PyObject_Call + 0xbc (0x559bf7704f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File 
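The DistBackendError above is raised while the NCCL communicator for this pipeline link is being bootstrapped: the receiving rank asks the c10d key-value store (hosted by the agent on rank 0's node) for the ncclUniqueId under key '0:1', and the TCP connection to that store is reset because its owner has already gone away. The snippet below is a minimal, CPU-only sketch of that store-mediated bootstrap, not nanotron's code: the gloo backend, localhost, port 29501 and the run_worker helper are all assumptions made so the example runs without GPUs, but the key-exchange path through the TCPStore is the same one the error message refers to.

# Minimal sketch of the c10d store-mediated bootstrap referenced by the error above.
# Assumptions: gloo backend (no GPUs/NCCL needed), localhost, port 29501, 2 processes;
# `run_worker` is an illustrative name, not taken from nanotron.
import datetime
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run_worker(rank: int, world_size: int) -> None:
    # Every rank connects to the same key-value store; rank 0 hosts it.
    store = dist.TCPStore(
        "127.0.0.1", 29501, world_size, rank == 0,
        timeout=datetime.timedelta(seconds=30),
    )
    # Process-group setup exchanges its bootstrap keys through this store
    # (for NCCL that key holds the ncclUniqueId). If the store's host dies,
    # the remaining ranks see "Connection reset by peer", as in the log.
    dist.init_process_group("gloo", store=store, rank=rank, world_size=world_size)

    payload = torch.zeros(4)
    if rank == 0:
        dist.send(torch.arange(4.0), dst=1)   # plays the upstream pipeline stage
    else:
        dist.recv(payload, src=0)             # plays the downstream stage that blocks here
        print(f"rank {rank} received {payload.tolist()}")

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run_worker, args=(2,), nprocs=2)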
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank39]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f174ee2d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: frame #1: + 0x5b3a23e (0x7f178894a23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f1788944c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f1788944f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f1788945fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank39]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17888fa371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17888fa371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17888fa371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17888fa371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f1750107189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank34]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank39]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f175010e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f175012d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #12: + 0x5adc309 (0x7f17888ec309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f471fa0c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank34]: frame #1: + 0x5b3a23e (0x7f475952923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f4759523c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #13: + 0x5ae6f10 (0x7f17888f6f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f4759523f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f4759524fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f47594d9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #14: + 0x5ae6fa5 (0x7f17888f6fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #15: + 0x5124446 (0x7f1787f34446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f47594d9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default7]:[rank39]: frame #16: + 0x1acf4b8 (0x7f17848df4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f47594d9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f47594d9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f4720ce6189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #17: + 0x5aee004 (0x7f17888fe004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4720ced610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4720d0c978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #12: + 0x5adc309 (0x7f47594cb309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #18: + 0x5af36b5 (0x7f17889036b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #19: + 0xd2631e (0x7f179b4ed31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #20: + 0x47def4 (0x7f179ac44ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #21: + 0x1445a6 (0x55a0ca9045a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #13: + 0x5ae6f10 (0x7f47594d5f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55a0ca8fda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #23: + 0x150866 (0x55a0ca910866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55a0ca8f9142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55a0ca904a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #26: PyObject_Call + 0xbc (0x55a0ca910f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55a0ca8f72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55a0ca904a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: 
frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55a0ca8f58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #14: + 0x5ae6fa5 (0x7f47594d5fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #15: + 0x5124446 (0x7f4758b13446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #16: + 0x1acf4b8 (0x7f47554be4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #17: + 0x5aee004 (0x7f47594dd004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #18: + 0x5af36b5 (0x7f47594e26b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #30: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #19: + 0xd2631e (0x7f476c0cc31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55a0ca8f58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #20: + 0x47def4 (0x7f476b823ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #21: + 0x1445a6 (0x557e48fe15a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #22: _PyObject_MakeTpCall + 0x26b (0x557e48fdaa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: frame #32: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #23: + 0x150866 (0x557e48fed866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55a0ca8f58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #34: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55a0ca8f58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55a0ca8fcf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55a0ca90ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: trainer.train(dataloader) -[default2]:[rank34]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x557e48fd6142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #25: _PyFunction_Vectorcall + 0x6c (0x557e48fe1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #38: + 0x211239 (0x55a0ca9d1239 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55a0ca8fda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #26: PyObject_Call + 0xbc (0x557e48fedf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x557e48fd42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55a0ca8f93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: frame #28: _PyFunction_Vectorcall + 0x6c (0x557e48fe1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x557e48fd28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: frame #30: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x557e48fd28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #32: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55a0ca904a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x557e48fd28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #34: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55a0ca8f4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55a0ca904a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x557e48fd28fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55a0ca8f58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x557e48fd9f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #45: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #37: _PyObject_Call_Prepend + 0x69 (0x557e48febc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: frame #46: PyObject_Call + 0xbc (0x55a0ca910f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: frame #38: + 0x211239 (0x557e490ae239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: frame #39: _PyObject_MakeTpCall + 0x26b (0x557e48fdaa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55a0ca8f72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: frame #48: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #49: PyObject_Call + 0xbc (0x55a0ca910f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55a0ca8f72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x557e48fd63e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557e48fe1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default2]:[rank34]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557e48fd1c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557e48fe1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default2]:[rank34]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x557e48fd28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #45: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55a0ca904a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: frame #46: PyObject_Call + 0xbc (0x557e48fedf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55a0ca8fd007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55a0ca90ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: frame #54: + 0x211239 (0x55a0ca9d1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x557e48fd42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #48: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: frame #55: PyObject_Call + 0x207 (0x55a0ca911067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55a0ca8f72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #57: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #49: PyObject_Call + 0xbc (0x557e48fedf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55a0ca8f58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 
(0x557e48fd42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557e48fe1a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557e48fda007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #59: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank39]: frame #60: PyObject_Call + 0xbc (0x55a0ca910f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557e48febc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55a0ca8f72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #54: + 0x211239 (0x557e490ae239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: frame #55: PyObject_Call + 0x207 (0x557e48fee067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x557e48fd42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: return func(*args, **kwargs) -[default7]:[rank39]: frame #62: + 0x150582 (0x55a0ca910582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: frame #63: PyObject_Call + 0xbc (0x55a0ca910f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #57: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x557e48fd28fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default2]:[rank34]: frame #59: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank34]: frame #60: PyObject_Call + 0xbc (0x557e48fedf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x557e48fd42b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #62: + 0x150582 (0x557e48fed582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #63: PyObject_Call + 0xbc (0x557e48fedf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank33]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank33]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feec4d5b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank33]: frame #1: + 0x5b3a23e (0x7feefe87823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7feefe872c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7feefe872f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7feefe873fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feefe828371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feefe828371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feefe828371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feefe828371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7feec6035189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7feec603c610 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7feec605b978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #12: + 0x5adc309 (0x7feefe81a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #13: + 0x5ae6f10 (0x7feefe824f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #14: + 0x5ae6fa5 (0x7feefe824fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #15: + 0x5124446 (0x7feefde62446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #16: + 0x1acf4b8 (0x7feefa80d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #17: + 0x5aee004 (0x7feefe82c004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #18: + 0x5af36b5 (0x7feefe8316b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #19: + 0xd2631e (0x7fef1141b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank33]: frame #20: + 0x47def4 (0x7fef10b72ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank33]: frame #21: + 0x1445a6 (0x562dcef205a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #22: _PyObject_MakeTpCall + 0x26b (0x562dcef19a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #23: + 0x150866 (0x562dcef2c866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x562dcef15142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #25: _PyFunction_Vectorcall + 0x6c (0x562dcef20a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #26: PyObject_Call + 0xbc (0x562dcef2cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x562dcef132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #28: _PyFunction_Vectorcall + 0x6c (0x562dcef20a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x562dcef118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #30: + 0x150582 (0x562dcef2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x562dcef118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #32: + 0x150582 (0x562dcef2c582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x562dcef118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #34: + 0x150582 (0x562dcef2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x562dcef118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x562dcef18f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #37: _PyObject_Call_Prepend + 0x69 (0x562dcef2ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #38: + 0x211239 (0x562dcefed239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #39: _PyObject_MakeTpCall + 0x26b (0x562dcef19a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x562dcef153e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #41: _PyFunction_Vectorcall + 0x6c (0x562dcef20a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x562dcef10c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #43: _PyFunction_Vectorcall + 0x6c (0x562dcef20a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x562dcef118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #45: + 0x150582 (0x562dcef2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #46: PyObject_Call + 0xbc (0x562dcef2cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x562dcef132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #48: + 0x150582 (0x562dcef2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #49: PyObject_Call + 0xbc (0x562dcef2cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x562dcef132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #51: _PyFunction_Vectorcall + 0x6c (0x562dcef20a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x562dcef19007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #53: _PyObject_Call_Prepend + 0x69 (0x562dcef2ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #54: + 0x211239 (0x562dcefed239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #55: PyObject_Call + 0x207 (0x562dcef2d067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x562dcef132b3 in 
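All four tracebacks die inside nanotron's P2P._recv_meta, i.e. while receiving the small metadata message that precedes the actual activation tensor on a pipeline-parallel link. The sketch below shows that "metadata first, payload second" receive pattern in plain torch.distributed. It is only an illustration under assumptions: the helper names send_with_meta/recv_with_meta, the fixed MAX_DIMS buffer and the float32 default are made up, and it presumes a process group is already initialized (e.g. as in the previous sketch); it is not nanotron's p2p.py.

# Illustrative two-phase point-to-point transfer: shape metadata first, payload second.
# Mirrors the call path in the traceback (irecv_tensors -> _recv_meta -> dist.recv),
# but is NOT nanotron's implementation; helper names and defaults are assumptions.
import torch
import torch.distributed as dist

MAX_DIMS = 8  # fixed-size metadata buffer so the receiver can post a recv without knowing the shape


def send_with_meta(t: torch.Tensor, dst: int) -> None:
    meta = torch.full((MAX_DIMS + 1,), -1, dtype=torch.long)
    meta[0] = t.dim()
    meta[1 : 1 + t.dim()] = torch.tensor(t.shape, dtype=torch.long)
    dist.send(meta, dst=dst)             # 1) tell the peer what is coming
    dist.send(t.contiguous(), dst=dst)   # 2) send the payload itself


def recv_with_meta(src: int, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    meta = torch.empty(MAX_DIMS + 1, dtype=torch.long)
    dist.recv(meta, src=src)             # the step that fails in the log above
    ndim = int(meta[0])
    shape = [int(s) for s in meta[1 : 1 + ndim]]
    payload = torch.empty(shape, dtype=dtype)
    dist.recv(payload, src=src)
    return payload

If the upstream stage has already aborted, the receiver blocks in that first metadata recv (or in the NCCL communicator setup for the link) until the store/rendezvous connection drops, which is exactly what the tracebacks above show.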
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #57: + 0x150582 (0x562dcef2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x562dcef118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #59: + 0x150582 (0x562dcef2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #60: PyObject_Call + 0xbc (0x562dcef2cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x562dcef132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #62: + 0x150582 (0x562dcef2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #63: PyObject_Call + 0xbc (0x562dcef2cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: . This may indicate a possible application crash on rank 0 or a network set up issue. -W0703 09:41:50.468000 140259772696384 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1207762 closing signal SIGTERM -E0703 09:41:51.127000 140259772696384 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 1207763) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:41:50 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1207764) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1207764 -[2]: - time : 2024-07-03_09:41:50 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1207765) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1207765 -[3]: - time : 2024-07-03_09:41:50 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1207766) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1207766 -[4]: - time : 2024-07-03_09:41:50 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 1207767
-[5]:
-  time : 2024-07-03_09:41:50
-  host : ip-26-0-160-192.ec2.internal
-  rank : 6 (local_rank: 6)
-  exitcode : -6 (pid: 1207768)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 1207768
-[6]:
-  time : 2024-07-03_09:41:50
-  host : ip-26-0-160-192.ec2.internal
-  rank : 7 (local_rank: 7)
-  exitcode : -6 (pid: 1207769)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 1207769
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time : 2024-07-03_09:41:50
-  host : ip-26-0-160-192.ec2.internal
-  rank : 1 (local_rank: 1)
-  exitcode : -6 (pid: 1207763)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 1207763
-============================================================
-srun: error: ip-26-0-160-192: task 0: Exited with exit code 1
-W0703 09:41:54.927000 140095301584640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-132.ec2.internal_2472189_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 09:41:54.968000 139875266402048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_971206_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 09:41:54.982000 139725008774912 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1901151_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 09:41:55.016000 140323911681792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1129659_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 09:41:55.448000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472258 closing signal SIGTERM -W0703 09:41:55.449000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472259 closing signal SIGTERM -W0703 09:41:55.449000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472260 closing signal SIGTERM -W0703 09:41:55.450000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472261 closing signal SIGTERM -W0703 09:41:55.450000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472262 closing signal SIGTERM -W0703 09:41:55.450000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472263 closing signal SIGTERM -W0703 09:41:55.450000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472264 closing signal SIGTERM -W0703 09:41:55.453000 140100962318144 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2472265 closing signal SIGTERM -W0703 09:41:55.453000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971274 closing signal SIGTERM -W0703 09:41:55.453000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971275 closing signal SIGTERM -W0703 09:41:55.453000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971276 closing signal SIGTERM -W0703 09:41:55.455000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971277 closing signal SIGTERM -W0703 09:41:55.456000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971278 closing signal SIGTERM -W0703 09:41:55.456000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971279 closing signal SIGTERM -W0703 09:41:55.457000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971280 closing signal SIGTERM -W0703 09:41:55.457000 139880927135552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 971281 closing signal SIGTERM -W0703 09:41:55.464000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129727 closing signal SIGTERM -W0703 09:41:55.464000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129728 closing signal SIGTERM -W0703 09:41:55.464000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129729 closing signal SIGTERM -W0703 09:41:55.464000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129730 closing signal SIGTERM -W0703 09:41:55.466000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129731 closing signal SIGTERM -W0703 09:41:55.467000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129732 closing signal SIGTERM -W0703 09:41:55.467000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129733 closing signal SIGTERM -W0703 09:41:55.468000 140329572415296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1129734 closing signal SIGTERM -W0703 09:41:55.466000 139730669508416 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901219 closing signal SIGTERM -W0703 09:41:55.466000 139730669508416 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901220 closing signal SIGTERM -W0703 09:41:55.467000 139730669508416 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901221 closing signal SIGTERM -W0703 09:41:55.468000 139730669508416 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901222 closing signal SIGTERM -W0703 09:41:55.469000 139730669508416 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901223 closing signal SIGTERM -W0703 09:41:55.469000 139730669508416 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901224 closing signal SIGTERM -W0703 09:41:55.470000 139730669508416 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901225 closing signal SIGTERM -W0703 09:41:55.470000 139730669508416 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1901226 closing signal SIGTERM -W0703 09:41:59.290000 139880927135552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_971206_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:41:59.305000 139880927135552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_971206_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 09:41:59.932000 140095301584640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-132.ec2.internal_2472189_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:41:59.987000 139725008774912 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1901151_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:42:00.020000 140323911681792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1129659_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 09:42:03.674000 140100962318144 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2472189_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:42:03.689000 140100962318144 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2472189_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-169-132: task 5: Exited with exit code 1 -W0703 09:42:04.971000 140329572415296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1129659_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 09:42:04.985000 140329572415296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1129659_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() -W0703 09:42:04.991000 139725008774912 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1901151_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. 
-W0703 09:42:05.691000 139730669508416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1901151_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:42:05.705000 139730669508416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1901151_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. 
-srun: error: ip-26-0-172-57: task 6: Exited with exit code 1
-srun: error: ip-26-0-169-86: task 4: Exited with exit code 1
-Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt
deleted file mode 100644
index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-512/status.txt
+++ /dev/null
@@ -1 +0,0 @@
-oom
\ No newline at end of file
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/bench.slurm
deleted file mode 100644
index 2f3f53f9d38acc464982ec6952e45dbcbbfc4e98..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/bench.slurm
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-
-#SBATCH --job-name=bench_cluster
-#SBATCH --time=01:30:00
-#SBATCH --partition=hopper-prod
-#SBATCH --nodes=8
-#SBATCH --gres=gpu:8
-#SBATCH --qos=high
-#SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=96
-#SBATCH --exclusive
-#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out
-#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out
-
-# Function to update status based on squeue output
-update_status() {
- job_id=$1
- status_file=$2
- # For unknown reasons, it doenst update status for pending. It only works for running
- while true; do
- job_status=$(squeue --job $job_id --noheader --format=%T)
- echo "Job status: $job_status"
- if [ -z "$job_status" ]; then
- # Job has finished or is not found
- break
- elif [ "$job_status" = "RUNNING" ]; then
- printf "running" > $status_file
- break
- fi
- sleep 10
- done
-}
-
-# Misc initializations.
-echo "========================"
-echo "START TIME: $(date)"
-source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
-conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
-echo python3 version = $(python3 --version)
-echo "========================"
-
-# Slurm stuff
-export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
-export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=$((1024 + RANDOM % 64511))
-
-export TMPDIR=/scratch
-export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
-export CUBLAS_WORKSPACE_CONFIG=":4096:8"
-export CUDA_DEVICE_MAX_CONNECTIONS="1"
-
-huggingface-cli login --token $HUGGINGFACE_TOKEN
-
-
-NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
-CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/config.yaml"
-
-LAUNCHER="torchrun \
- --nproc_per_node 8 \
- --nnodes 8 \
- --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
- --rdzv_backend c10d \
- --max_restarts 0 \
- --tee 3 \
- --node_rank ${SLURM_PROCID}"
-
-# Checkout the bench_cluster branch
-cd $NANOTRON_REPO
-git checkout bench_cluster
-cd ..
-# Get the current job ID
-job_id=${SLURM_JOB_ID}
-
-# Update status to "pending" or "running" in the background
-update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt &
-
-# Run the main command
-srun -u $LAUNCHER $CMD
-exit_status=$?
-
-# Update status based on the exit status of `srun`
-if [ $exit_status -eq 0 ]; then
- printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt
-else
- if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out; then
- printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt
- elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out; then
- printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt
- elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out; then
- printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt
- else
- printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt
- fi
-fi
-
-# Run the report script if the job completed successfully
-if [ $exit_status -eq 0 ]; then
- python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64 --is_logs
- python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64 --is_profiler
-fi
-
-
-# Push to hub the folder using huggingface_cli
-huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64"
-
-# Verify the upload
-if [ $? -eq 0 ]; then
- echo "Uploading to Huggingface Hub successful"
-else
- echo "Failed to upload to Huggingface Hub"
-fi
\ No newline at end of file
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/config.yaml
deleted file mode 100644
index 6401ea7cef0ae8f5f34659aedb89e24434db7efb..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/config.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-general:
-  project: bench_cluster
-  seed: 42
-model:
-  ddp_bucket_cap_mb: 25
-  dtype: bfloat16
-  init_method:
-    std: 0.025
-  make_vocab_size_divisible_by: 1
-  model_config:
-    bos_token_id: 1
-    eos_token_id: 2
-    hidden_act: silu
-    hidden_size: 2048
-    initializer_range: 0.02
-    intermediate_size: 4096
-    is_llama_config: true
-    max_position_embeddings: 4096
-    num_attention_heads: 32
-    num_hidden_layers: 24
-    num_key_value_heads: 32
-    pad_token_id: null
-    pretraining_tp: 1
-    rms_norm_eps: 1.0e-05
-    rope_scaling: null
-    rope_theta: 10000.0
-    tie_word_embeddings: true
-    use_cache: true
-    vocab_size: 50257
-optimizer:
-  accumulate_grad_in_fp32: true
-  clip_grad: 1.0
-  learning_rate_scheduler:
-    learning_rate: 0.0001
-    lr_decay_style: linear
-    lr_warmup_style: linear
-    lr_warmup_steps: 1
-    min_decay_lr: 1.0e-05
-  optimizer_factory:
-    adam_beta1: 0.9
-    adam_beta2: 0.95
-    adam_eps: 1.0e-08
-    name: adamW
-    torch_adam_is_fused: true
-  weight_decay: 0.01
-  zero_stage: 1
-parallelism:
-  dp: 1
-  expert_parallel_size: 1
-  pp: 2
-  pp_engine: 1f1b
-  tp: 32
-  tp_linear_async_communication: false
-  tp_mode: REDUCE_SCATTER
-profiler:
-  profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64
-tokenizer:
-  tokenizer_max_length: null
-  tokenizer_name_or_path: openai-community/gpt2
-  tokenizer_revision: null
-data_stages:
-- name: Training Stage
-  start_training_step: 1
-  data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 64
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: roneneldan/TinyStories
-      hf_dataset_splits: train
-      text_column_name: text
-    num_loading_workers: 0
-    seed: 42
-lighteval: null
-tokens:
-  train_steps: 20
-  val_check_interval: -1
-  batch_accumulation_per_replica: 16
-  limit_test_batches: 0
-  limit_val_batches: 0
-  micro_batch_size: 64
-  sequence_length: 4096
-logging:
-  iteration_step_info_interval: 1
-  log_level: info
-  log_level_replica: info
-checkpoints:
-  checkpoint_interval: 100000
-  checkpoints_path: /dev/null
-  resume_checkpoint_path: null
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out
deleted file mode 100644
index 378fe3ca38c8989d7061861a40488ffcfbc85017..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/log.out
+++ /dev/null
@@ -1,2698 +0,0 @@
-========================
-START TIME: Wed Jul 3 03:43:54 UTC 2024
-python3 version = Python 3.10.14
-========================
-The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
-Token is valid (permission: write).
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:43:58.851000 140061288589120 torch/distributed/run.py:757] -W0703 03:43:58.851000 140061288589120 torch/distributed/run.py:757] ***************************************** -W0703 03:43:58.851000 140061288589120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:43:58.851000 140061288589120 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.123000 140343363553088 torch/distributed/run.py:757] -W0703 03:43:59.123000 140343363553088 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.123000 140343363553088 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:43:59.123000 140343363553088 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.227000 140513502054208 torch/distributed/run.py:757] -W0703 03:43:59.227000 140513502054208 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.227000 140513502054208 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:43:59.227000 140513502054208 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.268000 139988487444288 torch/distributed/run.py:757] -W0703 03:43:59.268000 139988487444288 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.268000 139988487444288 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:43:59.268000 139988487444288 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.393000 139658385897280 torch/distributed/run.py:757] -W0703 03:43:59.393000 139658385897280 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.393000 139658385897280 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:43:59.393000 139658385897280 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.595000 140698422896448 torch/distributed/run.py:757] -W0703 03:43:59.595000 140698422896448 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.595000 140698422896448 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:43:59.595000 140698422896448 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.685000 139860340873024 torch/distributed/run.py:757] -W0703 03:43:59.685000 139860340873024 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.685000 139860340873024 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:43:59.685000 139860340873024 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.825000 140197727721280 torch/distributed/run.py:757] -W0703 03:43:59.825000 140197727721280 torch/distributed/run.py:757] ***************************************** -W0703 03:43:59.825000 140197727721280 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:43:59.825000 140197727721280 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:44:23 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config: -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: run='%date_%jobid', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: step=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: consumed_train_samples=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: benchmark_csv_path=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp=2, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp=32, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp_engine=, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_mode=, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: expert_parallel_size=1), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:44:23 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: eos_token_id=2, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_revision=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_max_length=None), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoint_interval=100000, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: save_initial_state=False, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: log_level_replica='info', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: train_steps=20, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: micro_batch_size=64, -[default0]:07/03/2024 03:44:23 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: batch_accumulation_per_replica=16, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: val_check_interval=-1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_val_batches=0, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_test_batches=0), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta1=0.9, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta2=0.95, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: name='adamW'), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: zero_stage=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: weight_decay=0.01, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: clip_grad=1.0, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_steps=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_style='linear', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_style='linear', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_steps=19, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: start_training_step=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_splits='train', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: text_column_name='text'), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_loading_workers=0))], -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64')), -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lighteval=None) -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Model Config: -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: 
eos_token_id=2, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272) -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Building model.. -[default0]:07/03/2024 03:44:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Setting PP block ranks... -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Total number of parameters: 1.22G (2318.88MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Parametrizing model parameters using StandardParametrizator -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=17|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=21|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=20|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=19|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=16|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=24|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: No checkpoint path provided. 
-[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=18|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: No checkpoint path provided. -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: No checkpoint path provided. -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=30|ip-26-0-171-88]: No checkpoint path provided. 
-[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=26|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=27|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=25|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=22|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=28|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-153]: No checkpoint path provided. 
-[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=31|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 03:44:42 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=29|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 03:44:42 [INFO|DP=0|PP=1|TP=23|ip-26-0-171-62]: No checkpoint path provided. 
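As a rough check on the "[After model building]" figures above: if one assumes the parameters are held in a 2-byte dtype (bf16/fp16 -- the log does not state this explicitly), the reported sizes line up with the parameter counts. The helper below is only an illustrative sketch under that assumption, not code from nanotron or the benchmark scripts.

def params_to_mib(n_params, bytes_per_param=2):
    # MiB = bytes / 2**20; bytes_per_param=2 assumes bf16/fp16 storage
    return n_params * bytes_per_param / 2**20

print(round(params_to_mib(16_400_000), 2))  # ~31.28, vs. "16.4M (31.22MiB)" on the PP=1 ranks
print(round(params_to_mib(21_600_000), 2))  # ~41.20, vs. "21.6M (41.25MiB)" on the PP=0 ranks

The small residual differences are consistent with the parameter counts being rounded to one decimal (16.4M, 21.6M) in the log.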
-[default0]:07/03/2024 03:44:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 03:44:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 03:44:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 03:44:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 03:44:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Using `datasets` library -[default0]:07/03/2024 03:44:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 03:44:45 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 03:44:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 03:44:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: -[default0]:07/03/2024 03:44:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Start training] datetime: 2024-07-03 03:44:47.752090 | mbs: 64 | grad_accum: 16 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 03:44:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 03:44:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default5]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=5|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=6|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=4|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=20|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=19|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=5|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=16|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=0|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=6|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=24|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=27|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=29|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=27|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=21|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=22|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=17|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=23|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=2|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=3|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=21|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=24|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=25|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=23|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=30|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=3|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=31|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=28|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=26|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=16|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=28|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=22|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=20|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:47 [WARNING|DP=0|PP=0|TP=19|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:47 [WARNING|DP=0|PP=1|TP=2|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=4|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=7|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=1|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=18|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=18|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:44:48 [WARNING|DP=0|PP=0|TP=29|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=31|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=17|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=30|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:44:48 [WARNING|DP=0|PP=1|TP=7|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:44:53 [WARNING|DP=0|PP=1|TP=25|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:44:53 [WARNING|DP=0|PP=0|TP=26|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:44:53 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:44:58 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-138]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub -[default3]:07/03/2024 03:44:58 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-138]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024). -[default3]:Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub -[default3]:Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024). -[default5]:07/03/2024 03:45:03 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-102]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub -[default5]:07/03/2024 03:45:03 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-102]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024). -[default5]:Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub -[default5]:Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024). 
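The cache warnings above show the `datasets` library failing to reach the Hub for roneneldan/TinyStories from the compute nodes and falling back to the cached copy under the HF_DATASETS_CACHE directory. A hedged sketch, not taken from the benchmark scripts: the documented HF_DATASETS_OFFLINE switch makes that fallback explicit instead of relying on the failed lookup.

import os
os.environ["HF_DATASETS_OFFLINE"] = "1"     # documented `datasets` env var: never query the Hub
from datasets import load_dataset           # must be imported after setting the env var

ds = load_dataset("roneneldan/TinyStories")  # resolved from the local cache path shown in the log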
-[default1]:07/03/2024 03:45:03 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-102]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub -[default1]:07/03/2024 03:45:03 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-102]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024). -[default1]:Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub -[default1]:Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024). -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank12]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank12]: output = model(**micro_batch) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: sharded_logits = self.model( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", 
line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank12]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank12]: output = self.pp_block(**new_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank12]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank12]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank15]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 
172, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank12]: return row_linear( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank12]: out = F.linear(input, weight, bias) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 595.94 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 703.94 MiB is free. Including non-PyTorch memory, this process has 78.63 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default0]:[rank8]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank8]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank8]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank8]: out = F.linear(input, weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. 
GPU -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank11]: output = model(**micro_batch) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank11]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank11]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank11]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 703.94 MiB is free. Including non-PyTorch memory, this process has 78.63 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
-[default1]:[rank9]: Traceback (most recent call last):
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank9]: trainer.train(dataloader)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank9]: output = model(**micro_batch)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank9]: sharded_logits = self.model(
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default1]:[rank9]: output = self.pp_block(**new_kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default1]:[rank9]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default1]:[rank9]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank9]: return self._call_impl(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank9]: return forward_call(*args, **kwargs)
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default1]:[rank9]: return row_linear(
-[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank9]: out = F.linear(input, weight, bias)
-[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 703.94 MiB is free. Including non-PyTorch memory, this process has 78.63 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default6]:[rank14]: out = F.linear(input, weight, bias)
-[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 595.94 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default2]:[rank10]: out = F.linear(input, weight, bias)
-[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 595.94 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default5]:[rank13]: out = F.linear(input, weight, bias)
-[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 703.94 MiB is free. Including non-PyTorch memory, this process has 78.63 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default6]:[rank22]: out = F.linear(input, weight, bias)
-[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 703.94 MiB is free. Including non-PyTorch memory, this process has 78.63 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default7]:[rank23]: out = F.linear(input, weight, bias)
-[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 595.94 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default5]:[rank21]: out = F.linear(input, weight, bias)
-[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 595.94 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default4]:[rank20]: out = F.linear(input, weight, bias)
-[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 703.94 MiB is free. Including non-PyTorch memory, this process has 78.63 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank25]: out = F.linear(input, weight, bias)
-[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 963.94 MiB is free. Including non-PyTorch memory, this process has 78.38 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default2]:[rank26]: out = F.linear(input, weight, bias)
-[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 775.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default3]:[rank27]: out = F.linear(input, weight, bias)
-[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 963.94 MiB is free. Including non-PyTorch memory, this process has 78.38 GiB memory in use.
Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: output = model(**micro_batch) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: sharded_logits = self.model( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank28]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank28]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank28]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank28]: return row_linear( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank28]: out = F.linear(input, weight, bias) -[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 775.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank29]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: output = model(**micro_batch) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: sharded_logits = self.model( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank29]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank29]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: output = self.pp_block(**new_kwargs) -[default5]:[rank29]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank29]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank29]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: return row_linear( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank29]: out = F.linear(input, weight, bias) -[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 963.94 MiB is free. Including non-PyTorch memory, this process has 78.38 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank31]: output = model(**micro_batch) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank31]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank31]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank31]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 963.94 MiB is free. Including non-PyTorch memory, this process has 78.38 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: sharded_logits = self.model( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank17]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank17]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 595.94 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = model(**micro_batch) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default6]:[rank30]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank30]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank30]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 775.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank16]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank16]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: return row_linear( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. 
GPU -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: output = model(**micro_batch) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank24]: sharded_logits = self.model( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank24]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank24]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank24]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank24]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank24]: return row_linear( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: out = F.linear(input, weight, bias) -[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. 
GPU -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default3]:[rank19]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank19]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank19]:     return forward_call(*args, **kwargs)
-[default3]:[rank19]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default3]:[rank19]:     hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default3]:[rank19]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank19]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank19]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank19]:     return forward_call(*args, **kwargs)
-[default3]:[rank19]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default3]:[rank19]:     hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default3]:[rank19]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank19]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank19]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank19]:     return forward_call(*args, **kwargs)
-[default3]:[rank19]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default3]:[rank19]:     return row_linear(
-[default3]:[rank19]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default3]:[rank19]:     out = F.linear(input, weight, bias)
-[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 595.94 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 11.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[Consolidated from the interleaved per-rank tracebacks that followed in the original log.] Rank 18 and ranks 0-7 raised the identical torch.cuda.OutOfMemoryError ("Tried to allocate 1024.00 MiB" on a 79.33 GiB GPU) from the same training step and the same call path: run_train.py:237 trainer.train(dataloader) -> trainer.py:429 train / 462 training_step -> parallel/pipeline_parallel/engine.py:278 train_batch_iter / 44 forward -> models/llama.py:891 sharded_logits = self.model(...) -> llama.py:764/780 forward_with_hidden_states -> parallel/pipeline_parallel/block.py:151 output = self.pp_block(**new_kwargs). Rank 18 then failed in the MLP, exactly like rank 19 above (llama.py:637 self.mlp -> llama.py:172 self.down_proj -> tensor_parallel/nn.py:159 row_linear -> tensor_parallel/functional.py:474 out = F.linear(input, weight, bias)); ranks 0-7 failed in the attention output projection (llama.py:631 self.attn -> llama.py:598 output = self.o_proj(attention_output) -> the same row_linear / F.linear call). Memory reported at the point of failure:
 - rank 18: 703.94 MiB free, 78.63 GiB in use (68.69 GiB allocated by PyTorch, 11.03 MiB reserved by PyTorch but unallocated)
 - ranks 1, 3, 5, 7: 117.94 MiB free, 79.20 GiB in use (70.01 GiB allocated by PyTorch, 43.99 MiB reserved but unallocated)
 - ranks 2, 4, 6: 225.94 MiB free, 79.10 GiB in use (70.01 GiB allocated by PyTorch, 43.99 MiB reserved but unallocated)
 - rank 0: same error; its message is cut off in the captured log after "Tried to allocate 1024.00 MiB. GPU".
Each message ends with the same PyTorch hint: "If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)".
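The hint repeated in every OOM message above refers to PyTorch's PYTORCH_CUDA_ALLOC_CONF environment variable. As a minimal, illustrative sketch only (assuming a bash-based launch environment; this is not taken from the benchmark's own scripts), the suggested allocator mode would be enabled by exporting the variable before the training command:

    # Sketch: opt in to PyTorch's expandable-segments allocator, as the OOM
    # message suggests, to reduce fragmentation of reserved-but-unallocated
    # memory. It does not add capacity: a configuration that truly needs more
    # than the ~79 GiB available per GPU will still run out of memory.
    export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"

Given the small reserved-but-unallocated figures reported above (roughly 11-44 MiB against ~70 GiB allocated by PyTorch), fragmentation is unlikely to be the limiting factor here; the failure looks like a genuine capacity limit for this parallelism configuration.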
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
-W0703 03:45:26.067000 139860340873024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1444330 closing signal SIGTERM
-W0703 03:45:26.068000 139988487444288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 688893 closing signal SIGTERM
-W0703 03:45:26.069000 139988487444288 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 688895 closing signal SIGTERM
-E0703 03:45:26.197000 140343363553088 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1168741) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
- 
time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1168742) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 1168743) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1168744) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 1168745) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1168746) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1168747) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1168748) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1168741) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 03:45:26.395000 139860340873024 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1444323) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:45:26.402000 139860340873024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1444251_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:26.427000 139860340873024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1444251_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:26.454000 139860340873024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1444251_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
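Every failure record above has an empty error_file and only the generic "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html" hint, so the per-rank exception behind exitcode 1 is not captured in this log. torch.distributed.elastic can persist it when the child entrypoint is wrapped with the record decorator. A minimal sketch, assuming run_train.py exposes a main() function (the wrapper is hypothetical, not nanotron's actual code):

    from torch.distributed.elastic.multiprocessing.errors import record


    @record  # on failure, writes this rank's traceback to the error_file torchrun reports
    def main() -> None:
        ...  # hypothetical training entrypoint


    if __name__ == "__main__":
        main()

With the decorator in place, the Failures table would point error_file at a JSON file containing the failing rank's traceback instead of leaving it blank as above.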
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-153.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 1444324) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-153.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 1444325) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-153.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 1444326) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 1444327) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-153.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 1444328) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-153.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 1444329) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 1444323) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-78: task 0: Exited with exit code 1 -E0703 03:45:26.693000 139988487444288 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 688889) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:45:26.699000 139988487444288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 
'ip-26-0-161-138.ec2.internal_688818_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:26.732000 139988487444288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_688818_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -W0703 03:45:26.756000 139988487444288 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_688818_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-138.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 688890) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-138.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 688891) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-138.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 688892) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-138.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 688894) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-138.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 688896) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:45:26 - host : ip-26-0-161-138.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 688889) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
-============================================================ -srun: error: ip-26-0-161-138: task 2: Exited with exit code 1 -W0703 03:45:30.105000 140192066987776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_892905_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:30.384000 140507841320704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_27612_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:30.733000 139652725163776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3788814_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:30.758000 140692762162944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_906322_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:31.021000 140055627855616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3917924_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:31.067000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892977 closing signal SIGTERM -W0703 03:45:31.068000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892978 closing signal SIGTERM -W0703 03:45:31.068000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892979 closing signal SIGTERM -W0703 03:45:31.068000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892980 closing signal SIGTERM -W0703 03:45:31.070000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892981 closing signal SIGTERM -W0703 03:45:31.068000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27684 closing signal SIGTERM -W0703 03:45:31.070000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892982 closing signal SIGTERM -W0703 03:45:31.068000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27685 closing signal SIGTERM -W0703 03:45:31.068000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27686 closing signal SIGTERM -W0703 03:45:31.070000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788887 closing signal SIGTERM -W0703 03:45:31.071000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788888 closing signal SIGTERM -W0703 03:45:31.071000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788889 closing signal SIGTERM -W0703 03:45:31.069000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27687 closing signal SIGTERM -W0703 03:45:31.070000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27688 closing signal SIGTERM -W0703 03:45:31.071000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788890 closing signal 
SIGTERM -W0703 03:45:31.071000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892983 closing signal SIGTERM -W0703 03:45:31.071000 140197727721280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 892984 closing signal SIGTERM -W0703 03:45:31.070000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27689 closing signal SIGTERM -W0703 03:45:31.070000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27690 closing signal SIGTERM -W0703 03:45:31.071000 140513502054208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 27691 closing signal SIGTERM -W0703 03:45:31.073000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788891 closing signal SIGTERM -W0703 03:45:31.073000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788892 closing signal SIGTERM -W0703 03:45:31.075000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788893 closing signal SIGTERM -W0703 03:45:31.075000 139658385897280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3788894 closing signal SIGTERM -W0703 03:45:31.076000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906394 closing signal SIGTERM -W0703 03:45:31.076000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906395 closing signal SIGTERM -W0703 03:45:31.076000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906396 closing signal SIGTERM -W0703 03:45:31.077000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3917997 closing signal SIGTERM -W0703 03:45:31.078000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3917998 closing signal SIGTERM -W0703 03:45:31.077000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906397 closing signal SIGTERM -W0703 03:45:31.078000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3917999 closing signal SIGTERM -W0703 03:45:31.078000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3918000 closing signal SIGTERM -W0703 03:45:31.079000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906398 closing signal SIGTERM -W0703 03:45:31.079000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906399 closing signal SIGTERM -W0703 03:45:31.080000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3918001 closing signal SIGTERM -W0703 03:45:31.080000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906400 closing signal SIGTERM -W0703 03:45:31.080000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3918002 closing signal SIGTERM -W0703 03:45:31.080000 140698422896448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 906401 closing signal SIGTERM -W0703 03:45:31.082000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3918003 closing signal SIGTERM -W0703 03:45:31.083000 140061288589120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3918004 closing signal SIGTERM -W0703 03:45:35.109000 140192066987776 
torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_892905_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:35.388000 140507841320704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_27612_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:35.737000 139652725163776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3788814_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:35.762000 140692762162944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_906322_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:36.026000 140055627855616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3917924_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:39.813000 140197727721280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_892905_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:39.819000 140698422896448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_906322_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:39.828000 140197727721280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_892905_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:39.830000 140698422896448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_906322_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-[... the torchrun agents on ip-26-0-161-103 and ip-26-0-171-88 then both abort with "torch.distributed.DistNetworkError: Broken pipe" followed by "torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details."; their tracebacks were printed concurrently and interleaved in the raw log and are identical to the one reproduced in full below for ip-26-0-166-125 ...]
-srun: error: ip-26-0-171-88: task 6: Exited with exit code 1
-srun: error: ip-26-0-161-103: task 1: Exited with exit code 1
-W0703 03:45:40.393000 140507841320704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_27612_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:40.741000 139652725163776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3788814_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:41.030000 140055627855616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3917924_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:43.219000 140513502054208 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_27612_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:43.230000 140513502054208 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_27612_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store
-    return getattr(self._store, store_op)(*args, **kwargs)
-torch.distributed.DistNetworkError: Broken pipe
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent
-    result = agent.run()
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
-    result = f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run
-    result = self._invoke_run(role)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run
-    num_nodes_waiting = rdzv_handler.num_nodes_waiting()
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting
-    self._state_holder.sync()
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync
-    get_response = self._backend.get_state()
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state
-    base64_state: bytes = self._call_store("get", self._key)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store
-    raise RendezvousConnectionError(
-torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.
-srun: error: ip-26-0-166-125: task 4: Exited with exit code 1
-W0703 03:45:45.327000 140061288589120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3917924_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 03:45:45.326000 139658385897280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3788814_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:45.337000 140061288589120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3917924_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:45:45.336000 139658385897280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3788814_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.
-srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-64/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/bench.slurm deleted file mode 100644 index 44597ba36a2ae9dd4b70d18e6ba39f1c341260c2..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8 llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/config.yaml b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/config.yaml deleted file mode 100644 index d6405079368c6ae537bcbf49c89b3b60ade70d1b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 128 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 8 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out deleted file mode 100644 index a69c0d8a64ffe8e6c7fccfbe5da461d26d513035..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/log.out +++ /dev/null @@ -1,5968 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:50:25 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:50:33.555000 140198859532096 torch/distributed/run.py:757] -W0703 09:50:33.555000 140198859532096 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.555000 140198859532096 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:50:33.555000 140198859532096 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.628000 140289820702528 torch/distributed/run.py:757] -W0703 09:50:33.628000 140289820702528 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.628000 140289820702528 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:50:33.628000 140289820702528 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.664000 140292326725440 torch/distributed/run.py:757] -W0703 09:50:33.664000 140292326725440 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.664000 140292326725440 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:50:33.664000 140292326725440 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.681000 140397348423488 torch/distributed/run.py:757] -W0703 09:50:33.681000 140397348423488 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.681000 140397348423488 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:50:33.681000 140397348423488 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.701000 140645926610752 torch/distributed/run.py:757] -W0703 09:50:33.701000 140645926610752 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.701000 140645926610752 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:50:33.701000 140645926610752 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.738000 140250535159616 torch/distributed/run.py:757] -W0703 09:50:33.738000 140250535159616 torch/distributed/run.py:757] ***************************************** -W0703 09:50:33.738000 140250535159616 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:50:33.738000 140250535159616 torch/distributed/run.py:757] ***************************************** -W0703 09:50:34.325000 140455991691072 torch/distributed/run.py:757] -W0703 09:50:34.325000 140455991691072 torch/distributed/run.py:757] ***************************************** -W0703 09:50:34.325000 140455991691072 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:50:34.325000 140455991691072 torch/distributed/run.py:757] ***************************************** -W0703 09:50:34.329000 140203763963712 torch/distributed/run.py:757] -W0703 09:50:34.329000 140203763963712 torch/distributed/run.py:757] ***************************************** -W0703 09:50:34.329000 140203763963712 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:50:34.329000 140203763963712 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:50:58 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Config: -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: run='%date_%jobid', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: seed=42, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: step=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: consumed_train_samples=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: benchmark_csv_path=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pp=2, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tp=32, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pp_engine=, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tp_mode=, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: expert_parallel_size=1), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:50:58 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: eos_token_id=2, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_act='silu', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_size=2048, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: initializer_range=0.02, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: intermediate_size=4096, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: is_llama_config=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: max_position_embeddings=4096, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_attention_heads=32, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_hidden_layers=24, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_key_value_heads=32, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pad_token_id=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pretraining_tp=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_scaling=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_theta=10000.0, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tie_word_embeddings=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: use_cache=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: vocab_size=50272), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokenizer_revision=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokenizer_max_length=None), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: checkpoint_interval=100000, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: save_initial_state=False, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: log_level_replica='info', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: train_steps=20, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: micro_batch_size=8, -[default0]:07/03/2024 09:50:58 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: batch_accumulation_per_replica=128, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: val_check_interval=-1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: limit_val_batches=0, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: limit_test_batches=0), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: adam_beta1=0.9, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: adam_beta2=0.95, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: name='adamW'), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: zero_stage=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: weight_decay=0.01, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: clip_grad=1.0, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_warmup_steps=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_warmup_style='linear', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_decay_style='linear', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_decay_steps=19, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: start_training_step=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hf_dataset_splits='train', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: text_column_name='text'), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: seed=42, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_loading_workers=0))], -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8')), -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lighteval=None) -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Model Config: -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: 
eos_token_id=2, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_act='silu', -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_size=2048, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: initializer_range=0.02, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: intermediate_size=4096, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: is_llama_config=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: max_position_embeddings=4096, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_attention_heads=32, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_hidden_layers=24, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_key_value_heads=32, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pad_token_id=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pretraining_tp=1, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_scaling=None, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_theta=10000.0, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tie_word_embeddings=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: use_cache=True, -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: vocab_size=50272) -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Building model.. -[default0]:07/03/2024 09:50:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Setting PP block ranks... -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=24|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=24|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=24|ip-26-0-174-36]: No checkpoint path provided. -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=28|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=28|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=28|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=0|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=0|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=0|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=3|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=3|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=3|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-43]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=5|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=5|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=5|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=1|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=1|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=1|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-43]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=2|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=2|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=2|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-43]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-43]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-43]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-43]: No checkpoint path provided. -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=22|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=22|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=22|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=16|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=6|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=16|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=16|ip-26-0-162-233]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=18|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=18|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=18|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=4|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=4|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=4|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=6|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=6|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=26|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=26|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=26|ip-26-0-174-36]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. 
Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=19|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=17|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=17|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=17|ip-26-0-173-7]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=19|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=19|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=21|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=21|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=21|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=20|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=20|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=20|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: No checkpoint path provided. -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: No checkpoint path provided. -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. 
Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=23|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=25|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=17|ip-26-0-162-233]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=23|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=23|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=17|ip-26-0-162-233]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=17|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=25|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Total number of parameters: 1.22G (2318.88MiB) -[default1]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=25|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=30|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=30|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=27|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Parametrizing model parameters using StandardParametrizator -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=27|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=30|ip-26-0-174-36]: No checkpoint path provided. -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=27|ip-26-0-174-36]: No checkpoint path provided. 
-[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=16|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=16|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=22|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=22|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=22|ip-26-0-173-7]: No checkpoint path provided. -[default0]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=16|ip-26-0-173-7]: No checkpoint path provided. -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=23|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=23|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=23|ip-26-0-173-7]: No checkpoint path provided. -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-153]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-153]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=31|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=31|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=31|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=29|ip-26-0-174-36]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=29|ip-26-0-174-36]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=29|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=21|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=21|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default5]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=21|ip-26-0-173-7]: No checkpoint path provided. 
-[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=19|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=19|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default3]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=19|ip-26-0-173-7]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=18|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=18|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=20|ip-26-0-173-7]: Local number of parameters: 16.4M (31.22MiB) -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=20|ip-26-0-173-7]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default4]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=20|ip-26-0-173-7]: No checkpoint path provided. -[default2]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=18|ip-26-0-173-7]: No checkpoint path provided. -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=7|ip-26-0-171-102]: Local number of parameters: 16.4M (31.22MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=7|ip-26-0-171-102]: [After model building] Memory usage: 41.23MiB. Peak allocated: 43.26MiB Peak reserved: 58.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=1|TP=7|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default6]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-43]: No checkpoint path provided. -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-43]: Local number of parameters: 21.6M (41.25MiB) -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-43]: [After model building] Memory usage: 55.26MiB. Peak allocated: 57.29MiB Peak reserved: 72.00MiB -[default7]:07/03/2024 09:51:17 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-43]: No checkpoint path provided. -[default0]:07/03/2024 09:51:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:51:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:51:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [ZeRO sharding] DP Rank 0 has 21.6M out of 21.6M (100.00%) params' optimizer states -[default0]:07/03/2024 09:51:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:51:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Using `datasets` library -[default0]:07/03/2024 09:51:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:51:20 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 09:51:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:51:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:51:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: -[default0]:07/03/2024 09:51:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Start training] datetime: 2024-07-03 09:51:23.025000 | mbs: 8 | grad_accum: 128 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:51:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:51:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 220.25MiB. Peak allocated 220.25MiB. Peak reserved: 240.00MiB -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=5|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=16|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=23|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=6|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=28|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=0|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=29|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=1|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=2|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=8|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=11|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=25|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=28|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=26|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=6|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=16|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=17|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=20|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=5|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=12|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=22|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=7|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=29|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=18|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=19|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. 
Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=24|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=3|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=27|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=18|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=2|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=3|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=21|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=23|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=10|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=30|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=13|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=25|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=31|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=21|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. 
Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=20|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=7|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=31|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=30|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=9|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=24|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=26|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=19|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=27|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=15|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=4|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=14|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:51:23 [WARNING|DP=0|PP=1|TP=4|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=22|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:51:23 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:51:24 [WARNING|DP=0|PP=0|TP=17|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. 
If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. 
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in 
_engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank14]: self.grads_buffer.append(recv_grad()) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank14]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank14]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank14]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank14]: dist.recv( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank14]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default6]:[rank22]: Traceback (most recent call last):
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank22]: trainer.train(dataloader)
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank22]: grad_accumulator.backward(sum(activations))
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default6]:[rank22]: result = loss.backward()
-[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default6]:[rank22]: torch.autograd.backward(
-[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default6]:[rank22]: _engine_run_backward(
-[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default6]:[rank22]: return user_fn(self, *args)
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default6]:[rank22]: pipeline_state.run_communication()
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default6]:[rank22]: self.grads_buffer.append(recv_grad())
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank22]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank22]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank22]: dist.recv(
-[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank22]: return func(*args, **kwargs)
-[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank22]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank55]: Traceback (most recent call last):
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank55]: trainer.train(dataloader)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank55]: output = model(**micro_batch)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank55]: sharded_logits = self.model(
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank55]: pipeline_state.run_communication()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank55]: recv_activation_tensor = recv_activation()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank55]: dist.recv(
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank55]: return func(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank54]: Traceback (most recent call last):
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank54]: trainer.train(dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank54]: output = model(**micro_batch)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank54]: sharded_logits = self.model(
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank54]: return self._call_impl(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank54]: return forward_call(*args, **kwargs)
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in
recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank13]: self.grads_buffer.append(recv_grad()) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank13]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank13]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank13]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default5]:[rank13]: dist.recv( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank13]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0ce83b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe0cfb14c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe0cfb19a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe0cfb1adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe11b5b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe1205fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe1203c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0ce83b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe0cfb14c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe0cfb19a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe0cfb1adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe11b5b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe1205fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe1203c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0ce83b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fe0cf79e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fe11b5b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fe1205fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fe1203c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank28]: self.grads_buffer.append(recv_grad()) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank28]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank28]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank28]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank28]: dist.recv( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank28]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return user_fn(self, *args) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank21]: self.grads_buffer.append(recv_grad()) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank21]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank21]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank21]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default5]:[rank21]: dist.recv( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank21]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) 
-[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank0]: Traceback (most recent call last):
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank0]: trainer.train(dataloader)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank0]: grad_accumulator.backward(sum(activations))
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank0]: result = loss.backward()
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank0]: torch.autograd.backward(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank0]: _engine_run_backward(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank0]: return user_fn(self, *args)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank0]: pipeline_state.run_communication()
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default0]:[rank0]: self.grads_buffer.append(recv_grad())
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default0]:[rank0]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank0]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank0]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank0]: dist.recv(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank0]: return func(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank0]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank5]: Traceback (most recent call last):
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default5]:[rank5]: trainer.train(dataloader)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default5]:[rank5]: grad_accumulator.backward(sum(activations))
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default5]:[rank5]: result = loss.backward()
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default5]:[rank5]: torch.autograd.backward(
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default5]:[rank5]: _engine_run_backward(
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default5]:[rank5]: return user_fn(self, *args)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default5]:[rank5]: pipeline_state.run_communication()
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default5]:[rank5]: self.grads_buffer.append(recv_grad())
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank5]: dist.recv(
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank5]: return func(*args, **kwargs)
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank4]: Traceback (most recent call last):
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default4]:[rank4]: trainer.train(dataloader)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default4]:[rank4]: grad_accumulator.backward(sum(activations))
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default4]:[rank4]: result = loss.backward()
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default4]:[rank4]: torch.autograd.backward(
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default4]:[rank4]: _engine_run_backward(
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default4]:[rank4]: return user_fn(self, *args)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default4]:[rank4]: pipeline_state.run_communication()
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default4]:[rank4]: self.grads_buffer.append(recv_grad())
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default4]:[rank4]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank4]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank4]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank4]: dist.recv(
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank4]: return func(*args, **kwargs)
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank4]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank51]: Traceback (most recent call last):
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default3]:[rank51]: trainer.train(dataloader)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank51]: output = model(**micro_batch)
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank51]: return self._call_impl(*args, **kwargs)
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank51]: return forward_call(*args, **kwargs)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank51]: sharded_logits = self.model(
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank51]: return self._call_impl(*args, **kwargs)
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank51]: return forward_call(*args, **kwargs)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank51]: return self._call_impl(*args, **kwargs)
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank51]: return forward_call(*args, **kwargs)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default3]:[rank51]: pipeline_state.run_communication()
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default3]:[rank51]: recv_activation_tensor = recv_activation()
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank51]: dist.recv(
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank51]: return func(*args, **kwargs)
-[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default5]:[rank53]: Traceback (most recent call last):
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default5]:[rank53]: trainer.train(dataloader)
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank53]: output = model(**micro_batch)
-[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank53]: return self._call_impl(*args, **kwargs)
-[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank53]: return forward_call(*args, **kwargs)
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank53]: sharded_logits = self.model(
-[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank53]: return self._call_impl(*args, **kwargs)
-[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank53]: return forward_call(*args, **kwargs)
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank53]: return self._call_impl(*args, **kwargs)
-[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank53]: return forward_call(*args, **kwargs)
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank53]: pipeline_state.run_communication()
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank53]: recv_activation_tensor = recv_activation()
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default1]:[rank49]: trainer.train(dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank49]: output = model(**micro_batch)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank49]: sharded_logits = self.model(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]: return self._call_impl(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]: return forward_call(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank49]: pipeline_state.run_communication()
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank49]: recv_activation_tensor = recv_activation()
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank6]: Traceback (most recent call last):
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank6]:     trainer.train(dataloader)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank6]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank6]:     outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank6]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank6]:     grad_accumulator.backward(sum(activations))
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default6]:[rank6]:     result = loss.backward()
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default6]:[rank6]:     torch.autograd.backward(
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default6]:[rank6]:     _engine_run_backward(
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default6]:[rank6]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default6]:[rank6]:     return user_fn(self, *args)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default6]:[rank6]:     pipeline_state.run_communication()
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default6]:[rank6]:     self.grads_buffer.append(recv_grad())
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default6]:[rank6]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank6]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank6]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank6]:     dist.recv(
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank6]:     return func(*args, **kwargs)
-[default6]:[rank6]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank6]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[rank 39 raises the same forward-path traceback as rank 51 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
[ranks 48, 61, 56, and 59 each raise the same forward-path traceback as rank 51 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
[ranks 1 and 8 raise the same backward-path traceback as rank 6 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."; rank 44 raises the same forward-path traceback as rank 51, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
[rank 45 raises the same forward-path traceback as rank 51 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
-[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank63]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
-[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa399d5a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa39b033c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa39b038a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa39b039dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: <unknown function> + 0xd3e95 (0x7fa3e6ad2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: <unknown function> + 0x8609 (0x7fa3ebb19609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7fa3eb8e4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa399d5a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa39b033c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa39b038a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa39b039dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: <unknown function> + 0xd3e95 (0x7fa3e6ad2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: <unknown function> + 0x8609 (0x7fa3ebb19609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7fa3eb8e4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa399d5a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: <unknown function> + 0xe32119 (0x7fa39acbd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: <unknown function> + 0xd3e95 (0x7fa3e6ad2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: <unknown function> + 0x8609 (0x7fa3ebb19609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7fa3eb8e4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out.
-[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank15]: Traceback (most recent call last):
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank15]: trainer.train(dataloader)
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600000 milliseconds before timing out.
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank15]: grad_accumulator.backward(sum(activations))
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank15]: result = loss.backward()
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank15]: torch.autograd.backward(
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank15]: _engine_run_backward(
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank15]: return user_fn(self, *args)
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank15]: pipeline_state.run_communication()
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default7]:[rank15]: self.grads_buffer.append(recv_grad())
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default7]:[rank15]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank15]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank15]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank15]: dist.recv(
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank15]: return func(*args, **kwargs)
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank15]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600098 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f809928b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f809a564c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f809a569a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f809a56adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f80e6003e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f80eb04a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f80eae15353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f809928b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f809a1ee119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f80e6003e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f80eb04a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f80eae15353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94af8b8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f94b0b91c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f94b0b96a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f94b0b97dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f94fc630e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f9501677609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9501442353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94af8b8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f94b0b91c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f94b0b96a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f94b0b97dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f94fc630e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f9501677609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9501442353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f94af8b8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f94b081b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f94fc630e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f9501677609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f9501442353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fafc8a62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fafc9d3bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fafc9d40a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fafc9d41dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fb0157dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fb01a821609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fb01a5ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fafc8a62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fafc9d3bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fafc9d40a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fafc9d41dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fb0157dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fb01a821609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fb01a5ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fafc8a62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fafc99c5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fb0157dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fb01a821609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fb01a5ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank16]: self.grads_buffer.append(recv_grad()) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank16]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank16]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default0]:[rank16]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank16]: dist.recv( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank16]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98a1564897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f98a283dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f98a2842a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f98a2843dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f98ee2dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f98f3323609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f98f30ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98a1564897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f98a283dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f98a2842a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f98a2843dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f98ee2dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f98f3323609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f98f30ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98a1564897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f98a24c7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f98ee2dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f98f3323609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f98f30ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa1e4c88897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa1e5f61c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa1e5f66a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa1e5f67dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa231a00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa236a47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa236812353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa1e4c88897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa1e5f61c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa1e5f66a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa1e5f67dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa231a00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa236a47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa236812353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa1e4c88897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fa1e5beb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fa231a00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fa236a47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fa236812353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd974d67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd976040c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd976045a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd976046dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd9c1adfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9c6b26609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd9c68f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd974d67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd976040c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd976045a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd976046dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd9c1adfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9c6b26609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd9c68f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd974d67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fd975cca119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd9c1adfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fd9c6b26609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd9c68f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efed0cdb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efed1fb4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efed1fb9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efed1fbadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7eff1da53e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7eff22a9a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7eff22865353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efed0cdb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efed1fb4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efed1fb9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efed1fbadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7eff1da53e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7eff22a9a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7eff22865353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efed0cdb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7efed1c3e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7eff1da53e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7eff22a9a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7eff22865353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3040b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff30538bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff305390a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff305391dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff350e2ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff355e71609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff355c3c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98fcda2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3040b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f98fe07bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f98fe080a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff30538bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff305390a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff305391dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff350e2ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff355e71609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #6: clone + 0x43 (0x7ff355c3c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff3040b2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7ff305015119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f98fe081dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f9949b1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70a6c1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70a7ef3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70a7ef8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70a7ef9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f70f3992e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f994eb61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f994e92c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #5: + 0x8609 (0x7f70f89d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f70f87a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70a6c1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70a7ef3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70a7ef8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70a7ef9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f70f3992e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f70f89d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f70f87a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:frame #2: + 0xd3e95 (0x7ff350e2ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70a6c1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f70a7b7d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f70f3992e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f70f89d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f70f87a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:frame #3: + 0x8609 (0x7ff355e71609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7ff355c3c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98fcda2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f98fe07bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f98fe080a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f98fe081dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f9949b1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f994eb61609 in
/lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f994e92c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98fcda2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f98fdd05119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f9949b1ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f994eb61609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f994e92c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank30]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in 
run_communication -[default6]:[rank30]: self.grads_buffer.append(recv_grad()) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank30]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank30]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank30]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank30]: dist.recv( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42ba8ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f42bbbc5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f42bbbcaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f42bbbcbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4307664e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f430c6ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f430c476353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42ba8ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f42bbbc5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f42bbbcaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f42bbbcbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4307664e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f430c6ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f430c476353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f42ba8ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f42bb84f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f4307664e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f430c6ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f430c476353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank31]: self.grads_buffer.append(recv_grad()) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) 
-[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600081 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f306e62a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f306f903c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f306f908a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f306f909dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f30bb3a2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f30c03e9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f30c01b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f306e62a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f306f903c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f306f908a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f306f909dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f30bb3a2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f30c03e9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f30c01b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f306e62a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f306f58d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f30bb3a2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f30c03e9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f30c01b4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank2]: self.grads_buffer.append(recv_grad()) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank2]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank2]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank2]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default2]:[rank2]: dist.recv( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank2]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank3]: self.grads_buffer.append(recv_grad()) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank3]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank3]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank3]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank3]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank3]: dist.recv( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank3]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f55e4a64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f55e5d3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f55e5d42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f55e5d43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f56317dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5636823609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f56365ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f55e4a64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f55e5d3dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f55e5d42a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f55e5d43dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f56317dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5636823609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f56365ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f55e4a64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f55e59c7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f56317dce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f5636823609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f56365ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = 
recv_from_pipeline_state_buffer(
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank38]: pipeline_state.run_communication()
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank38]: recv_activation_tensor = recv_activation()
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank38]: dist.recv(
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank38]: return func(*args, **kwargs)
-[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank23]: Traceback (most recent call last):
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank23]: trainer.train(dataloader)
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank23]: grad_accumulator.backward(sum(activations))
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank23]: result = loss.backward()
-[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank23]: torch.autograd.backward(
-[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank23]: _engine_run_backward(
-[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank23]: return user_fn(self, *args)
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank23]: pipeline_state.run_communication()
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default7]:[rank23]: self.grads_buffer.append(recv_grad())
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default7]:[rank23]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank23]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f686c230897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f686d509c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f686d50ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f686d50fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f68b8fa8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f68bdfef609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f68bddba353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out.
-[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f686c230897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: <unknown function> + 0xe32119 (0x7f686d193119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: <unknown function> + 0xd3e95 (0x7f68b8fa8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: <unknown function> + 0x8609 (0x7f68bdfef609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f68bddba353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
[The checkTimeout stack trace above is also repeated verbatim after the what() line, and the same watchdog messages and stack frames (only the addresses differ) were logged by ranks 56, 50, 55, 51, 53, 49, 48, 54, 1, 6, 5 and 22; only each of those ranks' timeout line is kept below.]
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out.
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600044 milliseconds before timing out.
-[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600056 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600026 milliseconds before timing out.
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600026 milliseconds before timing out.
-[default4]:[rank52]: Traceback (most recent call last):
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank52]: trainer.train(dataloader)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default4]:[rank52]: output = model(**micro_batch)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default4]:[rank52]: sharded_logits = self.model(
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default4]:[rank52]: pipeline_state.run_communication()
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank52]: recv_activation_tensor = recv_activation()
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank52]: dist.recv(
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank52]: return func(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600025 milliseconds before timing out.
-[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600025 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc9e1e10897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc9e30e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc9e30eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc9e30efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fca2eb88e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fca33bcf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fca3399a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc9e1e10897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fc9e2d73119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fca2eb88e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fca33bcf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fca3399a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank23]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank23]: dist.recv( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank23]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f86bb706897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f86bc9dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f86bc9e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f86bc9e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f870847ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f870d4c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f870d290353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f86bb706897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f86bc9dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f86bc9e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f86bc9e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f870847ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f870d4c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f870d290353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f86bb706897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f86bc669119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f870847ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f870d4c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f870d290353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fed837fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fed84ad5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fed84adaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fed84adbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fedd0574e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fedd55bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fedd5386353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fed837fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fed84ad5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fed84adaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fed84adbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fedd0574e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fedd55bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fedd5386353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fed837fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fed8475f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fedd0574e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fedd55bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fedd5386353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fedba51a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fedbb7f3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fedbb7f8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fedbb7f9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fee07292e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fee0c2d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fee0c0a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fedba51a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fedbb7f3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fedbb7f8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fedbb7f9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fee07292e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fee0c2d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fee0c0a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fedba51a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fedbb47d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fee07292e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fee0c2d9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fee0c0a4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5ada588897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5adb861c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5adb866a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5adb867dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f5b27300e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f5b2c347609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f5b2c112353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5ada588897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5adb861c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5adb866a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5adb867dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f5b27300e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f5b2c347609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f5b2c112353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5ada588897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f5adb4eb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f5b27300e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f5b2c347609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f5b2c112353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd484b5f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd485e38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd485e3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd485e3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd4d18d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd4d691e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd4d66e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd484b5f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd485e38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd485e3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd485e3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd4d18d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd4d691e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd4d66e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd484b5f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd485ac2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd4d18d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fd4d691e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd4d66e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. 
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70906be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7091997c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f709199ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f709199ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f70dd436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f70e247d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f70e2248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70906be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7091997c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f709199ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f709199ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f70dd436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f70e247d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f70e2248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70906be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f7091621119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f70dd436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f70e247d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f70e2248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe6f460c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe6f58e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe6f58eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe6f58ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fe741384e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fe7463cb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fe746196353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe6f460c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe6f58e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe6f58eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe6f58ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fe741384e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fe7463cb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fe746196353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe6f460c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fe6f556f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fe741384e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fe7463cb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fe746196353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd02caa5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd02dd7ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd02dd83a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd02dd84dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd07981de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd07e864609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd07e62f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd02caa5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd02dd7ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd02dd83a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd02dd84dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd07981de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd07e864609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd07e62f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd02caa5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd02da08119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd07981de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fd07e864609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd07e62f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72fbae1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f72fcdbac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f72fcdbfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f72fcdc0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7348859e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f734d8a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f734d66b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72fbae1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f72fcdbac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f72fcdbfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f72fcdc0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7348859e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f734d8a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f734d66b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72fbae1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f72fca44119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f7348859e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f734d8a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f734d66b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b5fefb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6b611d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6b611d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6b611dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f6bacc73e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f6bb1cba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f6bb1a85353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b5fefb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6b611d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6b611d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6b611dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f6bacc73e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f6bb1cba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f6bb1a85353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6b5fefb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f6b60e5e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f6bacc73e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f6bb1cba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f6bb1a85353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02193d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f021a6abc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f021a6b0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f021a6b1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f026614ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f026b191609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f026af5c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02193d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f021a6abc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f021a6b0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f021a6b1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f026614ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f026b191609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f026af5c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02193d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f021a335119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f026614ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f026b191609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f026af5c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f549c0af897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f549d388c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f549d38da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f549d38edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f54e8e27e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f54ede6e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f54edc39353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f549c0af897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f549d388c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f549d38da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f549d38edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f54e8e27e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f54ede6e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f54edc39353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f549c0af897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f549d012119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f54e8e27e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f54ede6e609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #4: clone + 0x43 (0x7f54edc39353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default0]:[rank40]: Traceback (most recent call last):
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank40]: trainer.train(dataloader)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank40]: output = model(**micro_batch)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank40]: return self._call_impl(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank40]: return forward_call(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank40]: sharded_logits = self.model(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank40]: return self._call_impl(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank40]: return forward_call(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank40]: return self._call_impl(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank40]: return forward_call(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default0]:[rank40]: pipeline_state.run_communication()
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default0]:[rank40]: recv_activation_tensor = recv_activation()
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank40]: dist.recv(
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank40]: return func(*args, **kwargs)
-[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank47]: Traceback (most recent call last):
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default7]:[rank47]: trainer.train(dataloader)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank47]: output = model(**micro_batch)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank47]: return self._call_impl(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank47]: return forward_call(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank47]: sharded_logits = self.model(
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank47]: return self._call_impl(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank47]: return forward_call(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank47]: return self._call_impl(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank47]: return forward_call(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank47]: pipeline_state.run_communication()
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank47]: recv_activation_tensor = recv_activation()
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank47]: dist.recv(
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank47]: return func(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600037 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6d0b8a9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6d0cb82c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6d0cb87a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6d0cb88dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f6d58621e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f6d5d668609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f6d5d433353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6d0b8a9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6d0cb82c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6d0cb87a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6d0cb88dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f6d58621e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f6d5d668609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f6d5d433353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6d0b8a9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f6d0c80c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f6d58621e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f6d5d668609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f6d5d433353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87d587a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f87d6b53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f87d6b58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f87d6b59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f88225f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8827639609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8827404353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87d587a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f87d6b53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f87d6b58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f87d6b59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f88225f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8827639609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8827404353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87d587a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f87d67dd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f88225f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f8827639609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f8827404353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f77d55bb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f77d6894c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f77d6899a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f77d689adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7822333e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f782737a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f7827145353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f77d55bb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f77d6894c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f77d6899a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f77d689adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7822333e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f782737a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f7827145353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f77d55bb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f77d651e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f7822333e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f782737a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f7827145353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f934b800897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f934cad9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f934cadea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f934cadfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9398578e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f939d5bf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f939d38a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f934b800897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f934cad9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f934cadea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f934cadfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9398578e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f939d5bf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f939d38a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f934b800897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f934c763119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9398578e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f939d5bf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f939d38a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc6ae147897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc6af420c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc6af425a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc6af426dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fc6faebfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fc6fff06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fc6ffcd1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc6ae147897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc6af420c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc6af425a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc6af426dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fc6faebfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fc6fff06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fc6ffcd1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc6ae147897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fc6af0aa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fc6faebfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fc6fff06609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fc6ffcd1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04dfc41897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f04e0f1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. 
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f04e0f1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f04e0f20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f052c9b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f0531a00609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f05317cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34182b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f341958ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3419593a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3419594dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f346502de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f346a074609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3469e3f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34182b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f341958ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3419593a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3419594dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]:frame #4: + 0xd3e95 (0x7f346502de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: + 0x8609 (0x7f346a074609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f3469e3f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600009 milliseconds before timing out.
-[default2]:
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04dfc41897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34182b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f04e0f1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #1: + 0xe32119 (0x7f3419218119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f04e0f1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f04e0f20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f052c9b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f0531a00609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f05317cb353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04dfc41897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7f04e0ba4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: + 0xd3e95 (0x7f346502de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #2: + 0xd3e95 (0x7f052c9b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #3: + 0x8609 (0x7f346a074609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: clone + 0x43 (0x7f3469e3f353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:frame #3: + 0x8609 (0x7f0531a00609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #4: clone + 0x43 (0x7f05317cb353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default2]:
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff2b7081897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff2b835ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff2b835fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff2b8360dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7ff303df9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7ff308e40609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7ff308c0b353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600029 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff2b7081897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff2b835ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff2b835fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff2b8360dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ff303df9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ff308e40609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ff308c0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff2b7081897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7ff2b7fe4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7ff303df9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7ff308e40609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7ff308c0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f50c2fea897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f50c42c3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f50c42c8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f50c42c9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f510fd62e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5114da9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f5114b74353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f50c2fea897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f50c42c3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f50c42c8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f50c42c9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f510fd62e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5114da9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f5114b74353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f50c2fea897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f50c3f4d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f510fd62e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f5114da9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f5114b74353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9c4b12a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9c4c403c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9c4c408a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9c4c409dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f9c97ea2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f9c9cee9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9c9ccb4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9c4b12a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9c4c403c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9c4c408a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9c4c409dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f9c97ea2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f9c9cee9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9c9ccb4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9c4b12a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f9c4c08d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f9c97ea2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f9c9cee9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f9c9ccb4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcebaff0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcebc2c9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcebc2cea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcebc2cfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fcf07d68e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fcf0cdaf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fcf0cb7a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcebaff0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcebc2c9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcebc2cea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcebc2cfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fcf07d68e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fcf0cdaf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fcf0cb7a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcebaff0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fcebbf53119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fcf07d68e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fcf0cdaf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fcf0cb7a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1608793897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1609a6cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1609a71a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1609a72dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f165550be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f165a552609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f165a31d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1608793897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1609a6cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1609a71a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1609a72dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f165550be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f165a552609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f165a31d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1608793897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f16096f6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f165550be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f165a552609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f165a31d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9e0d8aa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9e0eb83c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9e0eb88a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9e0eb89dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f9e5a622e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f9e5f669609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f9e5f434353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9e0d8aa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9e0eb83c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9e0eb88a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9e0eb89dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f9e5a622e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f9e5f669609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f9e5f434353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9e0d8aa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f9e0e80d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f9e5a622e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f9e5f669609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f9e5f434353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d3b693897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d3c96cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d3c971a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d3c972dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f9d8840be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f9d8d452609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9d8d21d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d3b693897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9d3c96cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9d3c971a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9d3c972dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f9d8840be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f9d8d452609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f9d8d21d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9d3b693897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f9d3c5f6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f9d8840be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f9d8d452609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f9d8d21d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9fc881897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa9fdb5ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa9fdb5fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa9fdb60dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7faa495f9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7faa4e640609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7faa4e40b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9fc881897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa9fdb5ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa9fdb5fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa9fdb60dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7faa495f9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7faa4e640609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7faa4e40b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9fc881897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fa9fd7e4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7faa495f9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7faa4e640609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7faa4e40b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9df614a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9df7423c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9df7428a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9df7429dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9e42ec2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9e47f09609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9e47cd4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9df614a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9df7423c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9df7428a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9df7429dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9e42ec2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9e47f09609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9e47cd4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9df614a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f9df70ad119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9e42ec2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f9e47f09609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f9e47cd4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c5681e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1c57af7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1c57afca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1c57afddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1ca3596e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1ca85dd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1ca83a8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c5681e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1c57af7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1c57afca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1c57afddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1ca3596e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1ca85dd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1ca83a8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c5681e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f1c57781119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f1ca3596e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: <unknown function> + 0x8609 (0x7f1ca85dd609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f1ca83a8353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default3]:[rank35]: Traceback (most recent call last):
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default3]:[rank35]:     trainer.train(dataloader)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank35]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank35]:     outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank35]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank35]:     output = model(**micro_batch)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank35]:     return forward_call(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank35]:     sharded_logits = self.model(
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600038 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:[rank35]:     return forward_call(*args, **kwargs)
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdb2eb36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank35]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank35]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]:     return self._call_impl(*args, **kwargs)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdb2fe0fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdb2fe14a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdb2fe15dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank35]:     return forward_call(*args, **kwargs)
-[default4]:frame #4: <unknown function> + 0xd3e95 (0x7fdb7b8aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default4]:frame #5: <unknown function> + 0x8609 (0x7fdb808f5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:[rank35]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:frame #6: clone + 0x43 (0x7fdb806c0353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default3]:[rank35]:     pipeline_state.run_communication()
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]:  what():  [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600038 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdb2eb36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:[rank35]:     recv_activation_tensor = recv_activation()
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdb2fe0fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdb2fe14a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdb2fe15dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default3]:[rank35]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank35]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:frame #4: <unknown function> + 0xd3e95 (0x7fdb7b8aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank35]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:frame #5: <unknown function> + 0x8609 (0x7fdb808f5609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank35]:     dist.recv(
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:frame #6: clone + 0x43 (0x7fdb806c0353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default3]:[rank35]:     return func(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank35]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
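The [rank35] traceback above is where the hang surfaces in this deleted run: nanotron's pipeline-parallel _recv_meta blocks in torch.distributed.recv waiting for the previous pipeline stage, while watchdogs elsewhere in the same process group report SEND work items (WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, ...)) that never complete within Timeout(ms)=600000; once a watchdog fires, the NCCL communicator is aborted and every rank in the group is torn down with c10::DistBackendError. The snippet below is a minimal sketch of that failure mode using only the public torch.distributed API; the two-rank setup, the shortened timeout, and the deliberately missing recv are illustrative assumptions, not the benchmark's actual code or configuration.

# sketch_nccl_send_timeout.py (hypothetical name) -- launch with: torchrun --nproc_per_node 2 sketch_nccl_send_timeout.py
# Illustrates the logged failure mode: a point-to-point SEND whose matching RECV is never posted,
# so the ProcessGroupNCCL watchdog aborts the communicator after the configured timeout.
import datetime
import os

import torch
import torch.distributed as dist


def main() -> None:
    # Timeout(ms)=600000 in the log is the default 10-minute watchdog window; it is shortened
    # here so the abort reproduces quickly (or could be raised for genuinely slow stages).
    dist.init_process_group(backend="nccl", timeout=datetime.timedelta(seconds=120))
    rank = dist.get_rank()
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    buf = torch.empty(2097152, device="cuda")  # same element count as the WorkNCCL entries above

    if rank == 0:
        dist.send(buf, dst=1)  # enqueued on the CUDA stream, but rank 1 never posts the matching recv
    # rank 1 intentionally posts nothing: the SEND can never complete, the watchdog times out,
    # aborts the NCCL communicator, and the process is taken down -- the same sequence recorded
    # for the ranks in the log above.
    torch.cuda.synchronize()
    dist.destroy_process_group()


if __name__ == "__main__":
    main()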
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdb2eb36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fdb2fa99119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fdb7b8aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fdb808f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fdb806c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e77096897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7e7836fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7e78374a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7e78375dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f7ec3e0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f7ec8e55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f7ec8c20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e77096897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7e7836fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7e78374a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7e78375dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f7ec3e0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f7ec8e55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f7ec8c20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e77096897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f7e77ff9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f7ec3e0ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f7ec8e55609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f7ec8c20353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa60618e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa607467c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa60746ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa60746ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fa652f06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fa657f4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fa657d18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa60618e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa607467c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa60746ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa60746ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fa652f06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fa657f4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fa657d18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa60618e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fa6070f1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fa652f06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fa657f4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fa657d18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f18c786a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f18c8b43c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f18c8b48a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f18c8b49dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f19145e2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1919629609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f19193f4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f18c786a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f18c8b43c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f18c8b48a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f18c8b49dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f19145e2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1919629609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f19193f4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f18c786a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f18c87cd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f19145e2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f1919629609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f19193f4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -W0703 10:01:51.078000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3842469 closing signal SIGTERM -W0703 10:01:51.080000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3842470 closing signal SIGTERM -W0703 10:01:51.080000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3842471 closing signal SIGTERM -W0703 10:01:51.080000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3842472 closing signal SIGTERM -W0703 10:01:51.080000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3842473 closing signal SIGTERM -W0703 10:01:51.080000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3842474 closing signal SIGTERM -W0703 10:01:51.080000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3842475 closing signal SIGTERM -E0703 10:01:51.248000 140198859532096 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1494431) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1494432) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494432 -[2]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1494433) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494433 -[3]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1494434) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494434 -[4]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1494435) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494435 -[5]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1494436) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494436 -[6]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1494437) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494437 -[7]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1494438) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494438 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:51 - host : ip-26-0-161-153.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1494431) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1494431 -============================================================ -E0703 10:01:51.332000 140292326725440 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 899924) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 899925) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899925 -[2]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 899926) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899926 -[3]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 899927) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899927 -[4]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 899928) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899928 -[5]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 899929) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899929 -[6]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 899930) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899930 -[7]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 899931) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899931 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:51 - host : ip-26-0-174-36.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 899924) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 899924 -============================================================ -srun: error: ip-26-0-161-153: task 0: Exited with exit code 1 -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -E0703 10:01:54.112000 140645926610752 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3842468) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:01:54.132000 140645926610752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3842392_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:01:54.169000 140645926610752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3842392_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:54.178000 140645926610752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3842392_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:51 - host : ip-26-0-171-102.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 3842468) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3842468 -============================================================ -W0703 10:01:54.886000 140198103230208 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-43.ec2.internal_1024760_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-171-102: task 4: Exited with exit code 1 -W0703 10:01:55.180000 140391687689984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-202.ec2.internal_1366229_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:55.253000 140284159969024 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-178.ec2.internal_584433_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:55.411000 140244874426112 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-7.ec2.internal_2090486_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:01:55.866000 140450330957568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-162-233.ec2.internal_1723944_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 10:01:56.227000 140397348423488 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1366306) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 10:01:56.232000 140250535159616 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 2090561) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:01:56.240000 140397348423488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1366229_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.246000 140250535159616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2090486_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.269000 140397348423488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1366229_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.277000 140250535159616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2090486_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.296000 140397348423488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1366229_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -E0703 10:01:56.300000 140289820702528 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 584510) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 1366307) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366307 -[2]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 1366308) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366308 -[3]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 1366309) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366309 -[4]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 1366310) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366310 -[5]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 1366311) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366311 -[6]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 1366312) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366312 -[7]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 1366313) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366313 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-202.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 1366306) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1366306 -============================================================ -W0703 10:01:56.304000 140250535159616 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2090486_0' has failed to shutdown the rendezvous 'none' due to an error of type 
RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 2090562) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090562 -[2]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 50 (local_rank: 2) - exitcode : -6 (pid: 2090563) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090563 -[3]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 51 (local_rank: 3) - exitcode : -6 (pid: 2090564) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090564 -[4]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 2090565) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090565 -[5]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 53 (local_rank: 5) - exitcode : -6 (pid: 2090566) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090566 -[6]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 2090567) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090567 -[7]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 55 (local_rank: 7) - exitcode : -6 (pid: 2090568) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090568 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:56 - host : ip-26-0-173-7.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 2090561) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2090561 -============================================================ -W0703 10:01:56.312000 140289820702528 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_584433_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-E0703 10:01:56.315000 140203763963712 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1024836) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 10:01:56.325000 140455991691072 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1724020) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:01:56.329000 140203763963712 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-43.ec2.internal_1024760_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.337000 140455991691072 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1723944_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.341000 140289820702528 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_584433_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.366000 140455991691072 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1723944_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.366000 140203763963712 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-43.ec2.internal_1024760_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:01:56.371000 140289820702528 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_584433_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 584511) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584511 -[2]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 584512) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584512 -[3]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 584513) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584513 -[4]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 584514) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584514 -[5]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 584515) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584515 -[6]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 584516) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584516 -[7]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 584517) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584517 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:56 - host : ip-26-0-161-178.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 584510) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 584510 -============================================================ -W0703 10:01:56.397000 140203763963712 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-43.ec2.internal_1024760_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -W0703 10:01:56.399000 140455991691072 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1723944_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 1024837) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024837 -[2]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 1024838) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024838 -[3]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 1024839) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024839 -[4]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 28 (local_rank: 4) - exitcode : -6 (pid: 1024840) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024840 -[5]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 1024841) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024841 -[6]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 1024842) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024842 
-[7]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 1024843) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024843 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:56 - host : ip-26-0-163-43.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 1024836) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1024836 -============================================================ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 1724021) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724021 -[2]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 1724022) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724022 -[3]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 1724023) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724023 -[4]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 1724024) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724024 -[5]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 1724025) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724025 -[6]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 1724026) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724026 -[7]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 1724027) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724027 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:01:56 - host : ip-26-0-162-233.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 1724020) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1724020 -============================================================ -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-173-202: task 6: Exited with exit code 1 -srun: error: ip-26-0-173-7: task 5: Exited with exit code 1 -srun: error: ip-26-0-162-233: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-43: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
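In the deleted log above, the NCCL watchdog timed out on a pipeline-parallel SEND (Timeout(ms)=600000), tore the process groups down, and torchrun's elastic agent then reported exit code -6 (SIGABRT) for every local rank on each node before srun exited with code 1. The root cause of a run like this can be read straight out of the elastic error report; the snippet below is a minimal sketch, not part of the deleted scripts, assuming a log with exactly the markers shown above and a hypothetical log path passed as the first argument.

#!/bin/bash
# summarize_failure.sh -- hypothetical helper, not part of this repo.
# Scans a torchrun/NCCL log shaped like the one above and prints the failure type
# plus the host/rank listed under each "Root Cause (first observed failure)" block.
LOG="${1:-log.out}"

# "Timeout at NCCL" is the same marker the accompanying bench.slurm scripts grep for
# before writing "timeout" into status.txt.
grep -q "Timeout at NCCL" "$LOG" && echo "failure type: NCCL collective timeout"

# torchrun prints time/host/rank/exitcode in the lines right after the root-cause header,
# so the host and rank of the first observed failure can be pulled out directly.
grep -A 4 "Root Cause (first observed failure):" "$LOG" | grep -E "host :|rank :"

Run against a log such as the one above, this would print the NCCL-timeout marker and the ip-26-0-*.ec2.internal host plus global rank of each agent's first failing process.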
diff --git a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt b/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-32_pp-2_mbz-8/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/bench.slurm deleted file mode 100644 index d3b91cc936c93809a0ad65b4a82be959047806c3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/config.yaml deleted file mode 100644 index 0a58337c1d7c57fc12ebba0c4591e130037ff45b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1024 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out deleted file mode 100644 index 8e3f44c42a2158afc12086c1512382157ce3b48e..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 05:54:33 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 05:54:37.857000 140269807015744 torch/distributed/run.py:757] -W0703 05:54:37.857000 140269807015744 torch/distributed/run.py:757] ***************************************** -W0703 05:54:37.857000 140269807015744 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:54:37.857000 140269807015744 torch/distributed/run.py:757] ***************************************** -W0703 05:54:37.876000 140096136595264 torch/distributed/run.py:757] -W0703 05:54:37.876000 140096136595264 torch/distributed/run.py:757] ***************************************** -W0703 05:54:37.876000 140096136595264 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:54:37.876000 140096136595264 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.047000 140334357559104 torch/distributed/run.py:757] -W0703 05:54:38.047000 140334357559104 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.047000 140334357559104 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:54:38.047000 140334357559104 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.156000 140225368581952 torch/distributed/run.py:757] -W0703 05:54:38.156000 140225368581952 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.156000 140225368581952 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:54:38.156000 140225368581952 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.170000 140507042469696 torch/distributed/run.py:757] -W0703 05:54:38.170000 140507042469696 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.170000 140507042469696 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 05:54:38.170000 140507042469696 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.277000 139625673328448 torch/distributed/run.py:757] -W0703 05:54:38.277000 139625673328448 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.277000 139625673328448 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:54:38.277000 139625673328448 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.794000 140602875393856 torch/distributed/run.py:757] -W0703 05:54:38.794000 140602875393856 torch/distributed/run.py:757] ***************************************** -W0703 05:54:38.794000 140602875393856 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:54:38.794000 140602875393856 torch/distributed/run.py:757] ***************************************** -W0703 05:54:39.168000 140364193937216 torch/distributed/run.py:757] -W0703 05:54:39.168000 140364193937216 torch/distributed/run.py:757] ***************************************** -W0703 05:54:39.168000 140364193937216 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:54:39.168000 140364193937216 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 05:55:03 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=64, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 05:55:03 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=1, -[default0]:07/03/2024 05:55:03 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=1024, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1')), -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304) -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 05:55:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank10]: trainer = DistributedTrainer(config_file) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank10]: self.model = self.init_model() # Defines self.model -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank10]: model = self._init_model_instance() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank10]: model = self._init_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank10]: model = build_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank10]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank10]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank10]: self.attn = CausalSelfAttention( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ 
-[default2]:[rank10]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank10]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank11]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank11]: self.model = self.init_model() # Defines self.model -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank11]: model = self._init_model_instance() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: model = self._init_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank11]: model = build_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank11]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: self.attn = CausalSelfAttention( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank13]: trainer = DistributedTrainer(config_file) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank13]: self.model = self.init_model() # Defines self.model -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank13]: model = self._init_model_instance() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank13]: model = self._init_model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank13]: model = build_model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank13]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank13]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank13]: self.attn = CausalSelfAttention( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank13]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank13]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank0]: trainer = DistributedTrainer(config_file) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank0]: self.model = self.init_model() # Defines self.model -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank0]: model = self._init_model_instance() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank0]: model = self._init_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank0]: model = build_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank0]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank0]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank0]: self.attn = CausalSelfAttention( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank0]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
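Every rank aborts on the same check: the config dump above shows the 64 reserved GPUs laid out as ParallelismArgs(dp=1, tp=64, pp=1), but the model has num_attention_heads=32, and nanotron's CausalSelfAttention asserts that the head count is divisible by the tensor-parallel group size. A minimal standalone sketch of that constraint (a hypothetical helper written for illustration, not nanotron's actual API) reproduces the failure:

```python
# Hypothetical helper mirroring the assert seen in the traceback
# (nanotron/src/nanotron/models/llama.py, line 271); not nanotron's own code.
def check_tp_compatibility(num_attention_heads: int, tp_size: int) -> None:
    # Each tensor-parallel rank must own a whole number of attention heads.
    assert num_attention_heads % tp_size == 0, (
        f"Number of attention heads ({num_attention_heads}) "
        f"must be divisible by TP size ({tp_size})."
    )

check_tp_compatibility(32, 16)  # passes: 2 heads per TP rank
check_tp_compatibility(32, 64)  # raises AssertionError, as in the log above
```

With dp and pp both fixed at 1, the 64-GPU reservation forces tp=64, which cannot divide 32 heads, so the assertion fires on every rank before training starts.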
[… interleaved copies of the same traceback from ranks 1-9, 12, 14 and 15 omitted; each ends with "AssertionError: Number of attention heads (32) must be divisible by TP size (64)." …]
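For reference, a quick sketch of which layouts this 64-GPU grid could use instead of tp=64, assuming only the two constraints visible in this log (dp * tp * pp must equal the 64 reserved GPUs, and tp must divide the 32 attention heads); any further nanotron-side requirements, such as having enough hidden layers per pipeline stage, are ignored here:

```python
# Hedged sketch: enumerate (dp, tp, pp) layouts for 64 GPUs under the single
# model constraint surfaced by the traceback (tp must divide 32 heads).
NUM_GPUS = 64
NUM_HEADS = 32

layouts = [
    (dp, tp, pp)
    for dp in range(1, NUM_GPUS + 1)
    for tp in range(1, NUM_GPUS + 1)
    for pp in range(1, NUM_GPUS + 1)
    if dp * tp * pp == NUM_GPUS and NUM_HEADS % tp == 0
]
print(layouts)  # includes e.g. (1, 32, 2) and (4, 16, 1); excludes (1, 64, 1)
```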
[… interleaved copies of the same traceback from ranks 16, 17, 21, 23, 24, 26, 27, 28, 30, 31 and 43 omitted; each ends with "AssertionError: Number of attention heads (32) must be divisible by TP size (64)." Tracebacks from the remaining ranks continue below. …]
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank47]: self.model = self.init_model() # Defines self.model -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank47]: model = self._init_model_instance() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank47]: model = self._init_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank47]: model = build_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank47]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank47]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank47]: self.attn = CausalSelfAttention( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank47]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank47]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank18]: trainer = DistributedTrainer(config_file) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: model = self._init_model_instance() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank18]: model = self._init_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: model = build_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank18]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank18]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank18]: self.attn = CausalSelfAttention( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank18]: config.num_attention_heads % 
tp_pg.size() == 0 -[default2]:[rank18]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank19]: trainer = DistributedTrainer(config_file) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank19]: self.model = self.init_model() # Defines self.model -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank19]: model = self._init_model_instance() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank19]: model = self._init_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank19]: model = build_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank19]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank19]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank19]: self.attn = CausalSelfAttention( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank19]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank19]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank22]: trainer = DistributedTrainer(config_file) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank22]: self.model = self.init_model() # Defines self.model -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank22]: model = self._init_model_instance() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank22]: model = self._init_model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank22]: model = build_model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank22]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank22]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank22]: self.attn = CausalSelfAttention( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank22]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank22]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank41]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank40]: self.model = self.init_model() # Defines self.model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank44]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: self.model = self.init_model() # Defines self.model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank40]: model = self._init_model_instance() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank41]: model = self._init_model_instance() -[default0]:[rank40]: model = self._init_model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank44]: self.model = self.init_model() # Defines self.model -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: model = self._init_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank44]: model = self._init_model_instance() -[default0]:[rank40]: model = build_model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank41]: model = build_model( -[default4]:[rank44]: model = self._init_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank44]: model = build_model( -[default1]:[rank41]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank41]: self.pp_block = 
self.module_builder(**self.module_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank40]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank40]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank44]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank40]: self.attn = CausalSelfAttention( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank41]: self.attn = CausalSelfAttention( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank40]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank44]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank41]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank41]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default0]:[rank40]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank44]: self.attn = CausalSelfAttention( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank44]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank45]: model = self._init_model_instance() -[default4]:[rank44]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank46]: trainer = DistributedTrainer(config_file) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank46]: self.model = self.init_model() # Defines self.model -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank46]: model = self._init_model_instance() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank46]: model = self._init_model( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank46]: model = build_model( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank46]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank46]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank42]: trainer = DistributedTrainer(config_file) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank42]: self.model = self.init_model() # Defines self.model -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank46]: self.attn = CausalSelfAttention( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model 
-[default2]:[rank42]: model = self._init_model_instance() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank42]: model = self._init_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank46]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank42]: model = build_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank42]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank46]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank42]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank42]: self.attn = CausalSelfAttention( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank42]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank42]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank20]: model = self._init_model_instance() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank20]: model = build_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: self.attn = CausalSelfAttention( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank20]: config.num_attention_heads % tp_pg.size() == 0 
-[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank25]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank25]: model = self._init_model_instance() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank25]: model = self._init_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank25]: model = build_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank25]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank25]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: self.attn = CausalSelfAttention( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank25]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model_instance() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: model = self._init_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank29]: model = build_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank54]: trainer = DistributedTrainer(config_file) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank54]: self.model = self.init_model() # Defines self.model -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank54]: model = self._init_model_instance() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank54]: model = self._init_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank54]: model = build_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank54]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank54]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank54]: self.attn = CausalSelfAttention( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank54]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank54]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: model = build_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank49]: trainer = DistributedTrainer(config_file) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank49]: self.model = self.init_model() # Defines self.model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank49]: model = self._init_model_instance() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank49]: model = self._init_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: model = build_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank49]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default1]:[rank49]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank49]: self.attn = CausalSelfAttention( -[default3]:[rank51]: model = self._init_model_instance() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank49]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank49]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = self._init_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: model = build_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank58]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank58]: model = self._init_model_instance() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank58]: model = self._init_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank58]: model = build_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank58]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank58]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank58]: self.attn = CausalSelfAttention( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank58]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: Traceback (most recent call last): -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: trainer = DistributedTrainer(config_file) -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank35]: trainer = DistributedTrainer(config_file) -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank38]: trainer = DistributedTrainer(config_file) -[default0]:[rank32]: trainer = DistributedTrainer(config_file) -[default5]:[rank37]: trainer = DistributedTrainer(config_file) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank38]: self.model = self.init_model() # Defines self.model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: self.model = self.init_model() # Defines self.model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank38]: model = self._init_model_instance() -[default5]:[rank37]: self.model = self.init_model() # Defines self.model -[default4]:[rank36]: trainer = DistributedTrainer(config_file) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank35]: self.model = self.init_model() # Defines self.model -[default0]:[rank32]: self.model = self.init_model() # Defines self.model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank38]: model = self._init_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: model = self._init_model_instance() -[default0]:[rank32]: model = self._init_model_instance() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank38]: model = build_model( -[default1]:[rank33]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank37]: model = self._init_model( -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank62]: trainer = DistributedTrainer(config_file) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank62]: self.model = self.init_model() # Defines self.model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank62]: model = self._init_model_instance() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank62]: model = self._init_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank62]: model = build_model( -[def[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank48]: Traceback (most recent call last): -[default4]:[rank52]: trainer = DistributedTrainer(config_file) -ault6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank62]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank62]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank62]: self.attn = CausalSelfAttention( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank62]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank62]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank37]: model = build_model( -[default6]:[rank38]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank48]: trainer = DistributedTrainer(config_file) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank52]: self.model = self.init_model() # Defines self.model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank63]: trainer = DistributedTrainer(config_file) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank63]: self.model = self.init_model() # Defines self.model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: model = self._init_model_instance() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank32]: model = self._init_model( -[default0]:[rank48]: self.model = self.init_model() # Defines self.model -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: model = self._init_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank63]: model = build_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank48]: model = self._init_model_instance() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank63]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank52]: model = self._init_model_instance() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank52]: model = self._init_model( -[default4]:[rank52]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank63]: self.attn = CausalSelfAttention( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank63]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank63]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank52]: model = build_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank36]: self.model = self.init_model() # Defines self.model -[default5]:[rank37]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank52]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank48]: model = self._init_model( -[default6]:[rank38]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank32]: model = build_model( -[default4]:[rank52]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank33]: model = self._init_model_instance() -[default4]:[rank52]: self.attn = CausalSelfAttention( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank48]: model = build_model( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank35]: model = self._init_model_instance() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank33]: model = self._init_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank48]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank48]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank36]: model = self._init_model_instance() -[default5]:[rank37]: self.pp_block = 
self.module_builder(**self.module_kwargs)
-[default0]:[rank56]: Traceback (most recent call last):
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default0]:[rank56]: trainer = DistributedTrainer(config_file)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default0]:[rank56]: self.model = self.init_model() # Defines self.model
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default0]:[rank56]: model = self._init_model_instance()
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default0]:[rank56]: model = self._init_model(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default0]:[rank56]: model = build_model(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default0]:[rank56]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default0]:[rank56]: self.pp_block = self.module_builder(**self.module_kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default0]:[rank56]: self.attn = CausalSelfAttention(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0
-[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[... the same traceback, interleaved across processes in the original log, is also printed by ranks 32-39, 48, 52, 53, 55, 57 and 59-61; every rank aborts with the identical AssertionError.]
-E0703 05:55:10.370000 140096136595264 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1191409) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 05:55:10.372000 140269807015744 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 41281) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 05:55:10.467000 140334357559104 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 930462) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 05:55:10.466000 140507042469696 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 915196) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 05:55:10.466000 140364193937216 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1466640) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 05:55:10.467000 139625673328448 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3940701) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 05:55:10.467000 140225368581952 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 712058) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 05:55:10.468000 140602875393856 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3811221) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[... the torchrun agent on each of the eight nodes prints this same ChildFailedError traceback and failure summary; every local rank exits with exitcode 1 at 2024-07-03_05:55:10. Affected hosts, global ranks and child pids: ip-26-0-160-225 (ranks 0-7, pids 41281-41288), ip-26-0-161-103 (ranks 8-15, pids 915196-915203), ip-26-0-161-138 (ranks 16-23, pids 712058-712065), ip-26-0-161-153 (ranks 24-31, pids 1466640-1466647), ip-26-0-161-78 (ranks 32-39, pids 1191409-1191416), ip-26-0-171-102 (ranks 40-47, pids 3811221-3811228), ip-26-0-171-62 (ranks 48-55, pids 3940701-3940708), ip-26-0-171-88 (ranks 56-63, pids 930462-930469). The summary for ip-26-0-171-102 closes the block: ...]
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
- time : 2024-07-03_05:55:10 - host : ip-26-0-171-102.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 3811221) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 4: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/bench.slurm deleted file mode 100644 index bae56b5adf521d9b08b20c90650591c7ce345dcc..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
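The update_status() helper defined at the top of this bench.slurm only ever writes "running" to status.txt; as its own comment admits, the PENDING state is never recorded. A rough Python equivalent that also captures the pending phase might look like the following (a sketch only, not part of the bench_cluster scripts; it shells out to the same squeue invocation the shell helper uses):

    import subprocess
    import time

    def update_status(job_id: str, status_file: str, poll_seconds: int = 10) -> None:
        """Poll squeue and mirror the Slurm job state into status_file."""
        while True:
            # Same squeue call as the shell helper: %T prints the job state.
            state = subprocess.run(
                ["squeue", "--job", job_id, "--noheader", "--format=%T"],
                capture_output=True, text=True,
            ).stdout.strip()
            if not state:            # job finished or no longer known to squeue
                break
            if state == "PENDING":   # the shell version silently skips this case
                with open(status_file, "w") as f:
                    f.write("pending")
            elif state == "RUNNING":
                with open(status_file, "w") as f:
                    f.write("running")
                break
            time.sleep(poll_seconds)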
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/config.yaml deleted file mode 100644 index 6d5f9a286268b39b0fd6b35a88927768a68945f1..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1024 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out deleted file mode 100644 index 59c6d83d98abf5741dc80c9b0b8ab1bdbb48832c..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Tue Jul 2 23:41:03 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0702 23:41:05.986000 140488894637888 torch/distributed/run.py:757] -W0702 23:41:05.986000 140488894637888 torch/distributed/run.py:757] ***************************************** -W0702 23:41:05.986000 140488894637888 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:41:05.986000 140488894637888 torch/distributed/run.py:757] ***************************************** -W0702 23:41:05.995000 140226739267392 torch/distributed/run.py:757] -W0702 23:41:05.995000 140226739267392 torch/distributed/run.py:757] ***************************************** -W0702 23:41:05.995000 140226739267392 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:41:05.995000 140226739267392 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.001000 140536171882304 torch/distributed/run.py:757] -W0702 23:41:06.001000 140536171882304 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.001000 140536171882304 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:41:06.001000 140536171882304 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.020000 140117944973120 torch/distributed/run.py:757] -W0702 23:41:06.020000 140117944973120 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.020000 140117944973120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:41:06.020000 140117944973120 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.034000 140598824511296 torch/distributed/run.py:757] -W0702 23:41:06.034000 140598824511296 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.034000 140598824511296 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0702 23:41:06.034000 140598824511296 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.035000 139859005994816 torch/distributed/run.py:757] -W0702 23:41:06.035000 139859005994816 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.035000 139859005994816 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:41:06.035000 139859005994816 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.054000 139982020859712 torch/distributed/run.py:757] -W0702 23:41:06.054000 139982020859712 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.054000 139982020859712 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:41:06.054000 139982020859712 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.121000 140327156496192 torch/distributed/run.py:757] -W0702 23:41:06.121000 140327156496192 torch/distributed/run.py:757] ***************************************** -W0702 23:41:06.121000 140327156496192 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:41:06.121000 140327156496192 torch/distributed/run.py:757] ***************************************** -[default0]:07/02/2024 23:41:26 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=64, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:41:26 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=1024, -[default0]:07/02/2024 23:41:26 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024')), -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304) -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/02/2024 23:41:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank18]: trainer = DistributedTrainer(config_file) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: model = self._init_model_instance() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank18]: model = self._init_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: model = build_model( -[def[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank9]: Traceback (most recent call last): -[default5]:[rank13]: trainer = DistributedTrainer(config_file) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank9]: trainer = DistributedTrainer(config_file) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank9]: self.model = self.init_model() # Defines self.model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in 
init_model -[default1]:[rank9]: model = self._init_model_instance() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -ault2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank18]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank9]: model = self._init_model( -[default5]:[rank13]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank18]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank18]: self.attn = CausalSelfAttention( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank18]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank18]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: model = build_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank13]: model = self._init_model_instance() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank9]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: model = self._init_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank9]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: self.attn = CausalSelfAttention( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank9]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank13]: model = build_model( -[default1]:[rank9]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
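The tracebacks above all end in the check raised inside nanotron's CausalSelfAttention: tensor parallelism splits the attention heads evenly across the TP group, so the head count must be divisible by the TP size. A simplified sketch of that constraint (illustrative, not the exact llama.py code):

```python
# Simplified sketch of the head-sharding constraint behind this assertion.
def local_attention_heads(num_attention_heads: int, tp_size: int) -> int:
    # Each tensor-parallel rank owns an equal slice of the heads; an uneven
    # split would leave some ranks with a fractional number of heads.
    assert num_attention_heads % tp_size == 0, (
        f"Number of attention heads ({num_attention_heads}) must be divisible "
        f"by TP size ({tp_size})."
    )
    return num_attention_heads // tp_size

local_attention_heads(32, 16)  # 2 heads per rank: a tp=16 layout is fine
local_attention_heads(32, 64)  # raises AssertionError: this tp=64 run fails
```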
-[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank13]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank13]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank13]: self.attn = CausalSelfAttention( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank13]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank13]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank17]: trainer = DistributedTrainer(config_file) -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank15]: trainer = DistributedTrainer(config_file) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank15]: self.model = self.init_model() # Defines self.model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: model = self._init_model_instance() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank15]: model = self._init_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: model = build_model( -[def[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank17]: self.model = self.init_model() # Defines self.model -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank17]: model = self._init_model_instance() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank17]: model = self._init_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank17]: model = build_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank17]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -ault7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank15]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) 
-[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank15]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank17]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank17]: self.attn = CausalSelfAttention( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank17]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank17]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank15]: self.attn = CausalSelfAttention( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank15]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank15]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: trainer = DistributedTrainer(config_file) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: self.model = self.init_model() # Defines self.model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank33]: model = self._init_model_instance() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank33]: model = self._init_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: model = build_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank33]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank33]: self.attn = CausalSelfAttention( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank33]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank33]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank32]: trainer = DistributedTrainer(config_file) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank32]: self.model = self.init_model() # Defines self.model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank32]: model = self._init_model_instance() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank32]: model = self._init_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank32]: model = build_model( -[def[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank20]: model = self._init_model_instance() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank20]: model = build_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank32]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank32]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank32]: self.attn = CausalSelfAttention( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank32]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank32]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-ault4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: self.attn = CausalSelfAttention( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank20]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank21]: trainer = DistributedTrainer(config_file) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank21]: self.model = self.init_model() # Defines self.model -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank21]: model = self._init_model_instance() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank21]: model = self._init_model( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank21]: model = build_model( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank21]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank21]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank11]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank11]: self.model = self.init_model() # Defines self.model -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank11]: model = self._init_model_instance() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: model = self._init_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank11]: model = build_model( 
-[def[default5]:[rank21]: self.attn = CausalSelfAttention( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank21]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank21]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -ault3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank11]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank16]: trainer = DistributedTrainer(config_file) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank16]: self.model = self.init_model() # Defines self.model -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: self.attn = CausalSelfAttention( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: model = self._init_model_instance() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank16]: model = self._init_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank16]: model = build_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank16]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank16]: self.attn = CausalSelfAttention( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank10]: trainer = DistributedTrainer(config_file) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank10]: self.model = self.init_model() # Defines self.model -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank10]: model = self._init_model_instance() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank10]: model = self._init_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank10]: model = build_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank10]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank10]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank10]: self.attn = CausalSelfAttention( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank10]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank10]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank14]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank14]: self.model = self.init_model() # Defines self.model -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank14]: model = self._init_model_instance() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank14]: model = self._init_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank14]: model = build_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank14]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank14]: self.attn = CausalSelfAttention( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank19]: trainer = DistributedTrainer(config_file) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank19]: self.model = self.init_model() # Defines self.model -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank19]: model = self._init_model_instance() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank19]: model = self._init_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank19]: model = build_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank19]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank19]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank19]: self.attn = CausalSelfAttention( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank19]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank19]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
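The same traceback repeats, interleaved, for every rank in the job. When triaging a log like this one, a small hypothetical helper (not part of the repository) can collapse the per-rank noise into the distinct error messages and the number of ranks reporting each:

```python
# Hypothetical triage helper for multi-rank logs (illustrative only).
import re
from collections import Counter

def summarize_errors(log_path: str) -> Counter:
    errors = Counter()
    pattern = re.compile(r"\[rank(\d+)\]: (\w+Error: .*)")
    seen = set()
    with open(log_path, errors="replace") as f:
        for line in f:
            for rank, msg in pattern.findall(line):
                if (rank, msg) not in seen:   # count each rank's error once
                    seen.add((rank, msg))
                    errors[msg.strip()] += 1
    return errors

for msg, n in summarize_errors("log.out").most_common():
    print(f"{n:3d} ranks  {msg}")
# Expected here: every failing rank reports the same
# "AssertionError: Number of attention heads (32) must be divisible by TP size (64)."
```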
-[default7]:[rank39]: Traceback (most recent call last): -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank38]: trainer = DistributedTrainer(config_file) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank39]: trainer = DistributedTrainer(config_file) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank38]: self.model = self.init_model() # Defines self.model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank38]: model = self._init_model_instance() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank38]: model = self._init_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank38]: model = build_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank38]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank39]: self.model = self.init_model() # Defines self.model -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank39]: model = self._init_model_instance() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank39]: model = self._init_model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank38]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank39]: model = build_model( -[default6]:[rank38]: self.attn = CausalSelfAttention( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank38]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank38]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank39]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank39]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank39]: self.attn = CausalSelfAttention( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank39]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank39]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank35]: trainer = DistributedTrainer(config_file) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank35]: self.model = self.init_model() # Defines self.model -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank35]: model = self._init_model_instance() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank35]: model = self._init_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank35]: model = build_model( -[def[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank57]: trainer = DistributedTrainer(config_file) -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank53]: trainer = DistributedTrainer(config_file) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank53]: self.model = self.init_model() # Defines self.model -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank53]: model = self._init_model_instance() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank53]: model = self._init_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank53]: model = build_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank35]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank35]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank35]: self.attn = CausalSelfAttention( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank35]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank12]: trainer = DistributedTrainer(config_file) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank12]: self.model = self.init_model() # Defines self.model -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank12]: model = self._init_model_instance() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank12]: model = self._init_model( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank12]: model = build_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank4]: trainer = DistributedTrainer(config_file) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank4]: self.model = self.init_model() # Defines self.model -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank60]: trainer = DistributedTrainer(config_file) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank60]: self.model = self.init_model() # Defines self.model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank60]: model = self._init_model_instance() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank60]: model = self._init_model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinault4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank12]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank53]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank53]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank53]: self.attn = CausalSelfAttention( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank53]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank53]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank4]: model = self._init_model_instance() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank4]: model = self._init_model( -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -and-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank12]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank12]: self.attn = CausalSelfAttention( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank12]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank12]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank0]: Traceback (most recent call last):
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default0]:[rank0]:     trainer = DistributedTrainer(config_file)
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default0]:[rank0]:     self.model = self.init_model()  # Defines self.model
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default0]:[rank0]:     model = self._init_model_instance()
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default0]:[rank0]:     model = self._init_model(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default0]:[rank0]:     model = build_model(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default0]:[rank0]:     block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default0]:[rank0]:     self.pp_block = self.module_builder(**self.module_kwargs)
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default0]:[rank0]:     self.attn = CausalSelfAttention(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default0]:[rank0]:     config.num_attention_heads % tp_pg.size() == 0
-[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
[… the interleaved log repeats this identical traceback for every other rank in the job; only the [defaultN]:[rankNN] prefix differs …]
-[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank1]: trainer = DistributedTrainer(config_file) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank1]: self.model = self.init_model() # Defines self.model -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank1]: model = self._init_model_instance() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank1]: model = self._init_model( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank1]: model = build_model( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank1]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank1]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank1]: self.attn = CausalSelfAttention( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank1]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank1]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank7]: trainer = DistributedTrainer(config_file) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank7]: self.model = self.init_model() # Defines self.model -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank7]: model = self._init_model_instance() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank7]: model = self._init_model( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank7]: model = build_model( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank7]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank7]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank7]: self.attn = CausalSelfAttention( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank7]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank7]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank6]: trainer = DistributedTrainer(config_file) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank6]: self.model = self.init_model() # Defines self.model -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank6]: model = self._init_model_instance() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank6]: model = self._init_model( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank6]: model = build_model( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank6]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank6]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank6]: self.attn = CausalSelfAttention( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank6]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank6]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank8]: trainer = DistributedTrainer(config_file) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank8]: self.model = self.init_model() # Defines self.model -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank8]: model = self._init_model_instance() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank8]: model = self._init_model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank8]: model = build_model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank8]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank8]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank8]: self.attn = CausalSelfAttention( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank8]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank37]: trainer = DistributedTrainer(config_file) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank37]: self.model = self.init_model() # Defines self.model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: model = self._init_model_instance() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank37]: model = self._init_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank37]: model = build_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank37]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank37]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank37]: self.attn = CausalSelfAttention( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank37]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank37]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank2]: trainer = DistributedTrainer(config_file) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank2]: self.model = self.init_model() # Defines self.model -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank2]: model = self._init_model_instance() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank2]: model = self._init_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank2]: model = build_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank2]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank2]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank54]: trainer = DistributedTrainer(config_file) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank54]: self.model = self.init_model() # Defines self.model -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank54]: model = self._init_model_instance() - __init__ -[default2]:[rank2]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank2]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank54]: model = self._init_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank54]: model = build_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank54]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank54]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank54]: self.attn = CausalSelfAttention( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank54]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank54]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: trainer = DistributedTrainer(config_file) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank40]: self.model = self.init_model() # Defines self.model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank40]: model = self._init_model_instance() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank40]: model = self._init_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank40]: model = build_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank40]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank40]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank40]: self.attn = CausalSelfAttention( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank40]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank40]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
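[Editor's note] The tracebacks above all reduce to one configuration error: the model in this sweep has 32 attention heads, but the run requests a tensor-parallel group of 64 ranks, and nanotron's CausalSelfAttention refuses to shard heads unevenly (the assertion at src/nanotron/models/llama.py, line 271 in this branch). Below is a minimal, hedged sketch of that divisibility check, using hypothetical names (tp_size_is_valid is not a nanotron function); it only restates the condition visible in the traceback, with the values taken from the assertion message itself.

    # Sketch of the head/TP divisibility constraint that every rank trips over above.
    def tp_size_is_valid(num_attention_heads: int, tp_size: int) -> bool:
        # Per the traceback: config.num_attention_heads % tp_pg.size() must be 0
        # so that attention heads can be split evenly across the TP group.
        return num_attention_heads % tp_size == 0

    print(tp_size_is_valid(32, 64))  # False: 32 % 64 == 32, so this run aborts at model build time
    print(tp_size_is_valid(32, 16))  # True:  32 heads shard evenly over 16 TP ranks (2 heads per rank)

Because the check fails on every rank during model construction, torchrun's elastic agent then reports one ChildFailedError summary per node (see below), srun exits with code 1 on all eight tasks, and the run's status.txt is recorded as "fail".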
-E0702 23:41:33.336000 140226739267392 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 487503) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:41:33.337000 139982020859712 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3183780) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:41:33.337000 140327156496192 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1824032) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:41:33.338000 140536171882304 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1099469) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:41:33.338000 140488894637888 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 865386) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:41:33.337000 139859005994816 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1796367) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:41:33.340000 140117944973120 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1023364) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0702 23:41:33.340000 140598824511296 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 860513) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", 
line 879, in main -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 865387) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 865388) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 865389) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 865390) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 865391) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 865392) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 865393) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/e run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -rrors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-73.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 865386) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in 
main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - raise ChildFailedError( - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 1796368) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : 1 (pid: 1796369) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 1796370) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : 1 (pid: 1796371) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : 1 (pid: 1796372) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : 1 (pid: 1796373) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : 1 (pid: 1796374) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 1796367) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
-============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1099470) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1099471) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1099472) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1099473) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1099474) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1099475) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1099476) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1099469) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 487504) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal 
- rank : 10 (local_rank: 2) - exitcode : 1 (pid: 487505) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 487506) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 487507) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 487508) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 487509) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 487510) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-161-178.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 487503) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 33 (local_rank: 1) - exitcode : 1 (pid: 1824033) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 1824034) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 35 (local_rank: 3) - exitcode : 1 (pid: 1824035) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 1824036) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 37 (local_rank: 5) - exitcode : 1 (pid: 1824037) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 
(pid: 1824038) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 39 (local_rank: 7) - exitcode : 1 (pid: 1824039) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-168-238.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 1824032) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 3183781) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 3183782) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 3183783) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 3183784) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 3183785) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 3183786) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 3183787) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/st run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -able/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-163-226.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 3183780) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 860514) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 860515) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 860516) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 860517) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 860518) - 
error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 860519) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 860520) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-165-24.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 860513) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 1023365) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 1023366) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 1023367) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 1023368) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 1023369) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 1023370) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 1023371) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-02_23:41:33 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 1023364) - error_file: - traceback 
: To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-1024/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/bench.slurm deleted file mode 100644 index 9e5bae4de3eb4be1d1af2573d6edb95b901b131f..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/config.yaml deleted file mode 100644 index d2ca47e59bd5543c10962fb706ef2f9d784ac2de..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 8 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 128 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out deleted file mode 100644 index 5f40a8b7ca21d3f7e58a636fb6bcd4b95e1387ed..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 02:13:09 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 02:13:11.717000 140614836901696 torch/distributed/run.py:757] -W0703 02:13:11.717000 140614836901696 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.717000 140614836901696 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:13:11.717000 140614836901696 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.726000 139799379121984 torch/distributed/run.py:757] -W0703 02:13:11.726000 139799379121984 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.726000 139799379121984 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:13:11.726000 139799379121984 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.724000 140408779085632 torch/distributed/run.py:757] -W0703 02:13:11.724000 140408779085632 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.724000 140408779085632 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:13:11.724000 140408779085632 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.730000 140049506486080 torch/distributed/run.py:757] -W0703 02:13:11.730000 140049506486080 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.730000 140049506486080 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:13:11.730000 140049506486080 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.731000 139780677457728 torch/distributed/run.py:757] -W0703 02:13:11.731000 139780677457728 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.731000 139780677457728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 02:13:11.731000 139780677457728 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.753000 140386803414848 torch/distributed/run.py:757] -W0703 02:13:11.753000 140386803414848 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.753000 140386803414848 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:13:11.753000 140386803414848 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.785000 139629394188096 torch/distributed/run.py:757] -W0703 02:13:11.785000 139629394188096 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.785000 139629394188096 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:13:11.785000 139629394188096 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.816000 139828360558400 torch/distributed/run.py:757] -W0703 02:13:11.816000 139828360558400 torch/distributed/run.py:757] ***************************************** -W0703 02:13:11.816000 139828360558400 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:13:11.816000 139828360558400 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 02:13:32 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=64, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:13:32 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=128, -[default0]:07/03/2024 02:13:32 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=8, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128')), -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304) -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 02:13:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank16]: trainer = DistributedTrainer(config_file) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank16]: self.model = self.init_model() # Defines self.model -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: model = self._init_model_instance() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank16]: model = self._init_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank16]: model = build_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank16]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank16]: self.attn = CausalSelfAttention( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ 
-[default0]:[rank16]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank10]: trainer = DistributedTrainer(config_file) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank10]: self.model = self.init_model() # Defines self.model -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank10]: model = self._init_model_instance() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank10]: model = self._init_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank10]: model = build_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank10]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank10]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank10]: self.attn = CausalSelfAttention( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank10]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank10]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
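The traceback above is the root cause of this run's failure: CausalSelfAttention in src/nanotron/models/llama.py (line 271) asserts that the number of attention heads is divisible by the tensor-parallel group size, and the 32 heads of this llama-1B config cannot be sharded across tp=64 ranks. A minimal standalone sketch of that constraint (a hypothetical reproduction for illustration, not nanotron's actual code):

def check_tp_compatibility(num_attention_heads: int, tp_size: int) -> None:
    # Each TP rank must own a whole number of attention heads.
    assert num_attention_heads % tp_size == 0, (
        f"Number of attention heads ({num_attention_heads}) must be divisible by TP size ({tp_size})."
    )

check_tp_compatibility(32, 32)  # passes: exactly one head per TP rank
check_tp_compatibility(32, 64)  # raises AssertionError, matching the failure on every rank above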
-[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank20]: model = self._init_model_instance() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank20]: model = build_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: self.attn = CausalSelfAttention( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank20]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
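For scale, the per-step token budget implied by the config echoed above (dp=1, micro_batch_size=128, batch_accumulation_per_replica=8, sequence_length=4096), had initialization succeeded, works out as in this small sketch:

# Token budget per optimizer step implied by the config above.
tokens_per_step = 1 * 128 * 8 * 4096  # dp * micro_batch_size * batch_accumulation_per_replica * sequence_length
print(tokens_per_step)                # 4_194_304 tokens (1_024 sequences of 4_096 tokens) per step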
-[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank2]: trainer = DistributedTrainer(config_file) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank2]: self.model = self.init_model() # Defines self.model -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank2]: model = self._init_model_instance() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank2]: model = self._init_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank2]: model = build_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank2]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank2]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank2]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank2]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
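This AssertionError matches none of the patterns the launcher script greps for ("OutOfMemoryError", " CUDA error: an illegal memory access", "Timeout at NCCL"), so the non-zero srun exit status falls through to the final else branch and status.txt ends up as "fail" rather than "oom" or "timeout". The same classification cascade, re-expressed in Python purely as an illustrative sketch (hypothetical helper, same patterns as the bash above):

from pathlib import Path

def classify_failure(log_path: str) -> str:
    # Mirror the grep cascade in bench.slurm: first matching pattern wins, else "fail".
    text = Path(log_path).read_text()
    if "OutOfMemoryError" in text or " CUDA error: an illegal memory access" in text:
        return "oom"
    if "Timeout at NCCL" in text:
        return "timeout"
    return "fail"

# For this log, none of the patterns match the AssertionError, so the recorded status is "fail".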
-[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank27]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank27]: self.model = self.init_model() # Defines self.model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank27]: model = self._init_model_instance() -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: Traceback (most recent call last): -[default6]:[rank30]: trainer = DistributedTrainer(config_file) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank30]: self.model = self.init_model() # Defines self.model -[default1]:[rank1]: Traceback (most recent call last): -[default3]:[rank27]: model = self._init_model( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank1]: trainer = DistributedTrainer(config_file) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank1]: self.model = self.init_model() # Defines self.model -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank1]: model = self._init_model_instance() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank1]: model = self._init_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank1]: model = build_model( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank1]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank1]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank1]: self.attn = CausalSelfAttention( -[default6]:[rank30]: model = self._init_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank1]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank1]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank1]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank27]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank27]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: trainer = DistributedTrainer(config_file) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank56]: self.model = self.init_model() # Defines self.model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank56]: model = self._init_model_instance() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank56]: model = self._init_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: model = build_model( -[def[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank30]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank12]: trainer = DistributedTrainer(config_file) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank12]: self.model = self.init_model() # Defines self.model -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank12]: model = self._init_model_instance() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank12]: model = self._init_model( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank12]: model = build_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank56]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank56]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank56]: self.attn = CausalSelfAttention( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default0]:[rank24]: trainer = DistributedTrainer(config_file) -[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank26]: Traceback (most recent call last): -ault4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank12]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank12]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank12]: self.attn = CausalSelfAttention( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank12]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default0]:[rank24]: self.model = self.init_model() # Defines self.model -[default4]:[rank12]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank28]: Traceback (most recent call last): -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank14]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank14]: self.model = self.init_model() # Defines self.model -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank14]: model = self._init_model_instance() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank14]: model = self._init_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank14]: model = build_model( -[def[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -ault6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank14]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank14]: self.attn = CausalSelfAttention( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default2]:[rank26]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: model = self._init_model_instance() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank30]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank24]: model = self._init_model( -[default2]:[rank26]: self.model = self.init_model() # Defines self.model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: model = build_model( -[def[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: model = self._init_model_instance() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank27]: self.attn = CausalSelfAttention( -[default5]:[rank29]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ 
-[default6]:[rank30]: self.attn = CausalSelfAttention( -ault2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: model = self._init_model( -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank38]: trainer = DistributedTrainer(config_file) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank38]: self.model = self.init_model() # Defines self.model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank38]: model = self._init_model_instance() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank38]: model = self._init_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank38]: model = build_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank38]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank38]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank38]: self.attn = CausalSelfAttention( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank38]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank38]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank26]: model = self._init_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank28]: self.model = self.init_model() # Defines self.model -[default6]:[rank30]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: model = build_model( -[default0]:[rank24]: model = build_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank27]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank26]: model = build_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: model = self._init_model_instance() -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank57]: trainer = DistributedTrainer(config_file) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank8]: Traceback (most recent call last): -[default1]:[rank57]: self.model = self.init_model() # Defines self.model -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank57]: model = self._init_model_instance() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank57]: model = self._init_model( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank57]: model = build_model( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank57]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
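Since every rank fails on the same assertion, fixing this run means choosing a tensor-parallel size that divides the 32 attention heads; on a 64-GPU allocation that restricts the layout to dp * tp * pp = 64 with tp in {1, 2, 4, 8, 16, 32}. A quick enumeration sketch (illustrative only; it ignores any further divisibility constraints nanotron may place on hidden size or key/value heads):

from itertools import product

NUM_GPUS = 64
NUM_HEADS = 32

def valid_layouts(num_gpus: int = NUM_GPUS, num_heads: int = NUM_HEADS):
    # Yield (dp, tp, pp) factorizations of the GPU count where TP evenly shards the heads.
    for dp, tp, pp in product(range(1, num_gpus + 1), repeat=3):
        if dp * tp * pp == num_gpus and num_heads % tp == 0:
            yield (dp, tp, pp)

print(sorted(valid_layouts()))  # includes (1, 32, 2) and (2, 16, 2); never tp=64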
-[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank15]: trainer = DistributedTrainer(config_file) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank57]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank57]: self.attn = CausalSelfAttention( -[default2]:[rank26]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank31]: model = self._init_model( -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank57]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank57]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank28]: model = self._init_model_instance() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank8]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank34]: trainer = DistributedTrainer(config_file) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank34]: self.model = self.init_model() # Defines self.model -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank34]: model = self._init_model_instance() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank34]: model = self._init_model( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank34]: model = build_model( -[def[default1]:[rank9]: trainer = DistributedTrainer(config_file) -[default7]:[rank15]: self.model = self.init_model() # Defines self.model -[default0]:[rank8]: self.model = self.init_model() # Defines self.model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: model = self._init_model_instance() -[default7]:[rank15]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -ault2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank34]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank34]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank34]: self.attn = CausalSelfAttention( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank34]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank34]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank9]: self.model = self.init_model() # Defines self.model -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: model = self._init_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank7]: trainer = DistributedTrainer(config_file) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank7]: self.model = self.init_model() # Defines self.model -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank7]: model = self._init_model_instance() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank7]: model = self._init_model( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank7]: model = build_model( -[default7]:[ran[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -k7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank7]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank7]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank7]: self.attn = CausalSelfAttention( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank7]: config.num_attention_heads % tp_pg.size() == 0 
-[default0]:[rank0]: Traceback (most recent call last):
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default0]:[rank0]:     trainer = DistributedTrainer(config_file)
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default0]:[rank0]:     self.model = self.init_model()  # Defines self.model
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default0]:[rank0]:     model = self._init_model_instance()
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default0]:[rank0]:     model = self._init_model(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default0]:[rank0]:     model = build_model(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default0]:[rank0]:     block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default0]:[rank0]:     self.pp_block = self.module_builder(**self.module_kwargs)
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default0]:[rank0]:     self.attn = CausalSelfAttention(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default0]:[rank0]:     config.num_attention_heads % tp_pg.size() == 0
-[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).

The same traceback and AssertionError ("Number of attention heads (32) must be divisible by TP size (64)") is raised on every rank of the 64-GPU job (ranks 0 through 63); the per-rank copies are interleaved in the deleted log and shown once above via rank 0.
-[default2]:[rank26]: self.attn = CausalSelfAttention( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank24]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank28]: model = self._init_model( -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank26]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank26]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank28]: model = build_model( -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank28]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank24]: self.attn = CausalSelfAttention( -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank24]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank24]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank28]: self.attn = CausalSelfAttention( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank28]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank25]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank25]: model = self._init_model_instance() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank25]: model = self._init_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank25]: model = build_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank25]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank25]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: self.attn = CausalSelfAttention( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank25]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank43]: model = self._init_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank42]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank46]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank46]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank42]: self.attn = CausalSelfAttention( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank43]: model = build_model( -[default2]:[rank42]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank42]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank46]: self.attn = CausalSelfAttention( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank46]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank43]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank46]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default3]:[rank43]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank44]: trainer = DistributedTrainer(config_file) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank44]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: model = self._init_model_instance() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank43]: self.attn = CausalSelfAttention( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank44]: model = self._init_model_instance() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default4]:[rank44]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank43]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank43]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank44]: model = build_model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank44]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank44]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank44]: self.attn = CausalSelfAttention( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank44]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank44]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: trainer = DistributedTrainer(config_file) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: self.model = self.init_model() # Defines self.model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank33]: model = self._init_model_instance() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank33]: model = self._init_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: model = build_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank33]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank33]: self.attn = CausalSelfAttention( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank33]: 
config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank33]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank51]: model = self._init_model_instance() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = self._init_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: model = build_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
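The failure above is a configuration error rather than a runtime fault: the run requests tp=64 across the 64 GPUs while the llama-1B config defines only 32 attention heads, so the check config.num_attention_heads % tp_pg.size() == 0 at llama.py:271 cannot hold. Below is a minimal pre-flight sketch of that divisibility rule in Python; the helper names (validate_tp_size, usable_tp_sizes) are illustrative additions and not nanotron APIs, and only the rule itself is taken from the assertion in the log.

    # Sketch of the sanity check that failed on every rank (assumed helpers, not nanotron code).
    def validate_tp_size(num_attention_heads: int, tp: int) -> None:
        # Same condition as llama.py:271 in the traceback above.
        assert num_attention_heads % tp == 0, (
            f"Number of attention heads ({num_attention_heads}) must be divisible by TP size ({tp})."
        )

    def usable_tp_sizes(num_attention_heads: int, num_gpus: int) -> list[int]:
        # TP degrees that divide the head count and also divide the GPU count,
        # so dp * tp * pp can still multiply out to num_gpus.
        return [tp for tp in range(1, num_gpus + 1)
                if num_attention_heads % tp == 0 and num_gpus % tp == 0]

    if __name__ == "__main__":
        heads, gpus = 32, 64                 # values taken from this config and job size
        print(usable_tp_sizes(heads, gpus))  # [1, 2, 4, 8, 16, 32]; tp=64 is not usable
        validate_tp_size(heads, 64)          # raises the same AssertionError as the log

With 32 heads, tp is capped at 32 for this model, so filling 64 GPUs would need dp or pp greater than 1 (for example a dp-2_tp-32_pp-1 layout) or a model with at least 64 attention heads.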
-E0703 02:13:39.100000 139799379121984 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3209027) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:13:39.102000 140386803414848 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1823600) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:13:39.195000 140408779085632 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 762926) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:13:39.198000 139780677457728 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 513078) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:13:39.199000 139828360558400 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1126164) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:13:39.198000 140614836901696 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 891976) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:13:39.200000 139629394188096 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1849795) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:13:39.200000 140049506486080 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1049531) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in
- sys.exit(main())
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
- return f(*args, **kwargs)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
- run(args)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
- elastic_launch(
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
- return launch_agent(self._config, self._entrypoint, list(args))
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
- raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures: the torchrun agent on each of the 8 nodes raises the same ChildFailedError with the same traceback. All 64 ranks exited with exitcode 1 at 2024-07-03_02:13:39 (error_file empty; traceback: To enable traceback see https://pytorch.org/docs/stable/elastic/errors.html):
- ip-26-0-160-192.ec2.internal : ranks 0-7 (local_ranks 0-7, pids 1126164-1126171)
- ip-26-0-161-178.ec2.internal : ranks 8-15 (local_ranks 0-7, pids 513078-513085)
- ip-26-0-163-220.ec2.internal : ranks 16-23 (local_ranks 0-7, pids 762926-762933)
- ip-26-0-163-226.ec2.internal : ranks 24-31 (local_ranks 0-7, pids 3209027-3209034)
- ip-26-0-168-238.ec2.internal : ranks 32-39 (local_ranks 0-7, pids 1849795-1849802)
- ip-26-0-169-86.ec2.internal : ranks 40-47 (local_ranks 0-7, pids 1823600-1823607)
- ip-26-0-172-57.ec2.internal : ranks 48-55 (local_ranks 0-7, pids 1049531-1049538)
- ip-26-0-172-73.ec2.internal : ranks 56-63 (local_ranks 0-7, pids 891976-891983)
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: - time : 2024-07-03_02:13:39 - host : ip-26-0-163-220.ec2.internal - rank : 16 (local_rank: 0) -
exitcode : 1 (pid: 762926) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-128/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/bench.slurm deleted file mode 100644 index 31d578ec19221249e4fb51a3c855869c8a2f841b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/config.yaml deleted file mode 100644 index 89a2642db44574f391e0318efb2fa1c83a4e23ee..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 64 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 16 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out deleted file mode 100644 index c824f1003d77a7575dd02d0045b1fb3257cdb6d3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Tue Jul 2 23:39:36 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
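Editor's note: the config.yaml deleted just above fixes dp=1, tp=64, pp=1 with micro_batch_size=16, batch_accumulation_per_replica=64 and sequence_length=4096. A minimal sketch (not part of the benchmark harness) of the sizes that configuration implies:

```python
# Sketch only: sizes implied by dp-1_tp-64_pp-1_mbz-16/config.yaml (not code from the repo).
dp, tp, pp = 1, 64, 1
micro_batch_size = 16          # tokens.micro_batch_size
grad_accum = 64                # tokens.batch_accumulation_per_replica
sequence_length = 4096         # tokens.sequence_length

world_size = dp * tp * pp                               # 64 GPUs = 8 nodes x 8 GPUs, matching torchrun --nnodes 8 --nproc_per_node 8
global_batch_size = dp * micro_batch_size * grad_accum  # 1024 sequences per optimizer step
tokens_per_step = global_batch_size * sequence_length   # 4194304 tokens per step

print(world_size, global_batch_size, tokens_per_step)   # 64 1024 4194304
```

The run never reaches a training step, though: the tp=64 setting trips the head-divisibility assertion shown further down in log.out.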
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0702 23:39:39.145000 140306096875328 torch/distributed/run.py:757] -W0702 23:39:39.145000 140306096875328 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.145000 140306096875328 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:39:39.145000 140306096875328 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.147000 139649365083968 torch/distributed/run.py:757] -W0702 23:39:39.147000 139649365083968 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.147000 139649365083968 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:39:39.147000 139649365083968 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.147000 139987077363520 torch/distributed/run.py:757] -W0702 23:39:39.147000 139987077363520 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.147000 139987077363520 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:39:39.147000 139987077363520 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.148000 140698504841024 torch/distributed/run.py:757] -W0702 23:39:39.148000 140698504841024 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.148000 140698504841024 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:39:39.148000 140698504841024 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.148000 140094763059008 torch/distributed/run.py:757] -W0702 23:39:39.148000 140094763059008 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.148000 140094763059008 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0702 23:39:39.148000 140094763059008 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.151000 139788966299456 torch/distributed/run.py:757] -W0702 23:39:39.151000 139788966299456 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.151000 139788966299456 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:39:39.151000 139788966299456 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.153000 139814066579264 torch/distributed/run.py:757] -W0702 23:39:39.153000 139814066579264 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.153000 139814066579264 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:39:39.153000 139814066579264 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.175000 140156427769664 torch/distributed/run.py:757] -W0702 23:39:39.175000 140156427769664 torch/distributed/run.py:757] ***************************************** -W0702 23:39:39.175000 140156427769664 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:39:39.175000 140156427769664 torch/distributed/run.py:757] ***************************************** -[default0]:07/02/2024 23:39:59 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=64, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:39:59 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=16, -[default0]:07/02/2024 23:39:59 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=64, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16')), -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304) -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/02/2024 23:39:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank0]: trainer = DistributedTrainer(config_file) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank0]: self.model = self.init_model() # Defines self.model -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank0]: model = self._init_model_instance() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank0]: model = self._init_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank0]: model = build_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank0]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank0]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank0]: self.attn = CausalSelfAttention( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank0]: 
config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank4]: trainer = DistributedTrainer(config_file) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank4]: self.model = self.init_model() # Defines self.model -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank4]: model = self._init_model_instance() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank4]: model = self._init_model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank4]: model = build_model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank4]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank4]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank4]: self.attn = CausalSelfAttention( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank4]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank4]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
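Rank 0 and rank 4 above (and, as the rest of the log shows, every other rank) die in nanotron's CausalSelfAttention constructor: with tp=64, each tensor-parallel rank would have to own 32/64 = 0.5 attention heads, which the assert at llama.py line 271 rejects. A minimal sketch of that constraint; check_tp_compatible is a hypothetical helper, not nanotron's API:

```python
# Sketch of the constraint behind the AssertionError; check_tp_compatible is a
# hypothetical helper, not a nanotron function.
def check_tp_compatible(num_attention_heads: int, tp_size: int) -> int:
    """Return the number of attention heads owned by each tensor-parallel rank."""
    assert num_attention_heads % tp_size == 0, (
        f"Number of attention heads ({num_attention_heads}) must be divisible "
        f"by TP size ({tp_size})."
    )
    return num_attention_heads // tp_size

print(check_tp_compatible(32, 32))  # 1 head per rank: tp=32 is the largest valid TP for this model
check_tp_compatible(32, 64)         # raises AssertionError, exactly as in this run
```

For a 32-head model, valid TP sizes are the divisors of 32 (1, 2, 4, 8, 16, 32), so this dp-1_tp-64_pp-1 sweep cannot start regardless of micro-batch size.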
-[consolidated: interleaved, near-identical tracebacks from ranks 1-3, 5-8, 12, 13, 25, 28, 32, 33, 35-39, 48, 49, 52 and 55 follow here, written concurrently by torchrun --tee across all eight nodes (several lines are truncated mid-word by the interleaving). Every traceback that completes within this excerpt goes through DistributedTrainer -> build_model -> CausalSelfAttention (llama.py line 271) and ends with the same two lines as rank 0:]
-[rankN]: config.num_attention_heads % tp_pg.size() == 0
-[rankN]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default0]:[rank8]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank8]: self.attn = CausalSelfAttention( -[default6]:[rank14]: Traceback (most recent call last): -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank9]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank9]: self.model = self.init_model() # Defines self.model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank9]: model = self._init_model_instance() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: trainer = DistributedTrainer(config_file) -[default1]:[rank9]: model = self._init_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank8]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: self.model = self.init_model() # Defines self.model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: model = build_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank14]: model = self._init_model_instance() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank9]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank14]: model = self._init_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank14]: model = build_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank14]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank9]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank9]: self.attn = CausalSelfAttention( -[default6]:[rank14]: self.attn = CausalSelfAttention( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank9]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank9]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank1]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank1]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank2]: model = build_model( -[default7]:[rank7]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank7]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank2]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank2]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank2]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank2]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank26]: trainer = DistributedTrainer(config_file) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank26]: self.model = self.init_model() # Defines self.model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: model = self._init_model_instance() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: model = self._init_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: model = build_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank26]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank26]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank26]: self.attn = CausalSelfAttention( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank26]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank26]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model_instance() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: model = self._init_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank29]: model = build_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank24]: trainer = DistributedTrainer(config_file) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank24]: self.model = self.init_model() # Defines self.model -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank24]: model = self._init_model_instance() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank24]: model = self._init_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank24]: model = build_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank24]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank24]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank24]: self.attn = CausalSelfAttention( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank24]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank24]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: trainer = DistributedTrainer(config_file) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank30]: self.model = self.init_model() # Defines self.model -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank30]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank30]: model = self._init_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank30]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank30]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank30]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank30]: self.attn = CausalSelfAttention( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank30]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank30]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank27]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank27]: self.model = self.init_model() # Defines self.model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank27]: model = self._init_model_instance() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank27]: model = self._init_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank27]: model = build_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank27]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank27]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank27]: self.attn = CausalSelfAttention( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank27]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank31]: model = self._init_model_instance() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: model = self._init_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: model = build_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank34]: trainer = DistributedTrainer(config_file) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank34]: self.model = self.init_model() # Defines self.model -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank34]: model = self._init_model_instance() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank34]: model = self._init_model( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank51]: model = self._init_model_instance() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = self._init_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: model = build_model( -[def[default2]:[rank34]: model = build_model( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank34]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank34]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank34]: self.attn = CausalSelfAttention( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank34]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank34]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-ault3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank58]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank58]: model = self._init_model_instance() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank58]: model = self._init_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank58]: model = build_model( -[def[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -ault2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank58]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank58]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank58]: self.attn = CausalSelfAttention( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank58]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank18]: trainer = DistributedTrainer(config_file) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: model = self._init_model_instance() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank18]: model = self._init_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: model = build_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank18]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank16]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank16]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: model = self._init_model_instance() -[default2]:[rank18]: self.attn = CausalSelfAttention( -[default0]:[rank16]: model = self._init_model_instance() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank16]: model = self._init_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model 
-[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank18]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: model = build_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: model = build_model( -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank10]: trainer = DistributedTrainer(config_file) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank10]: self.model = self.init_model() # Defines self.model -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank10]: model = self._init_model_instance() -[default2]:[rank18]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank10]: model = self._init_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank10]: model = build_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank10]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank10]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: self.attn = CausalSelfAttention( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank10]: self.attn = CausalSelfAttention( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank10]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank10]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: self.attn = CausalSelfAttention( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank20]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank45]: model = self._init_model_instance() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank43]: Traceback (most recent call last): -[default0]:[rank40]: Traceback (most recent call last): -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank42]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank47]: Traceback (most recent call last): -[default0]:[rank40]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank11]: self.model = self.init_model() # Defines self.model -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank11]: model = self._init_model_instance() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: model = self._init_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank11]: model = build_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank40]: self.model = self.init_model() # Defines self.model -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank11]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank42]: self.model = self.init_model() # Defines self.model -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: self.attn = CausalSelfAttention( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank41]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank47]: trainer = DistributedTrainer(config_file) -[default3]:[rank43]: trainer = DistributedTrainer(config_file) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank15]: trainer = DistributedTrainer(config_file) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank15]: self.model = self.init_model() # Defines self.model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: model = self._init_model_instance() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank15]: model = self._init_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: model = build_model( -[def[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default2]:[rank42]: model = self._init_model_instance() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -ault7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: model = build_model( -[default0]:[rank40]: model = self._init_model_instance() -[default1]:[rank41]: self.model = self.init_model() # Defines self.model -[default7]:[rank15]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank15]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, 
in __init__ -[default7]:[rank15]: self.attn = CausalSelfAttention( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank15]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank15]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank42]: model = self._init_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank43]: self.model = self.init_model() # Defines self.model -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank47]: self.model = self.init_model() # Defines self.model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: model = self._init_model_instance() -[default0]:[rank40]: model = self._init_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank42]: model = build_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank47]: model = self._init_model_instance() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank43]: model = self._init_model_instance() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank42]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank41]: model = self._init_model( -[default0]:[rank40]: model = build_model( -[default7]:[rank47]: model = self._init_model( -[default1]:[rank41]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank43]: model = self._init_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank41]: model = build_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank63]: Traceback (most recent call last): -[default4]:[rank60]: Traceback (most recent call last): -[default7]:[rank47]: model = build_model( -[default3]:[rank43]: model = build_model( -[default0]:[rank40]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank42]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank47]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank63]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank63]: self.model = self.init_model() # Defines self.model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: model = self._init_model_instance() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank63]: model = self._init_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank63]: model = build_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank63]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: self.pp_block = 
self.module_builder(**self.module_kwargs)
-[default4]:[rank44]: Traceback (most recent call last):
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default4]:[rank44]:     trainer = DistributedTrainer(config_file)
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default4]:[rank44]:     self.model = self.init_model()  # Defines self.model
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default4]:[rank44]:     model = self._init_model_instance()
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default4]:[rank44]:     model = self._init_model(
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default4]:[rank44]:     model = build_model(
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default4]:[rank44]:     block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default4]:[rank44]:     self.pp_block = self.module_builder(**self.module_kwargs)
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default4]:[rank44]:     self.attn = CausalSelfAttention(
-[default4]:[rank44]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default4]:[rank44]:     config.num_attention_heads % tp_pg.size() == 0
-[default4]:[rank44]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[... the same AssertionError traceback, heavily interleaved in the original log, is emitted by every other rank (0-63) across all 8 nodes ...]
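Every rank fails on the same check: nanotron's CausalSelfAttention shards the attention heads across the tensor-parallel group, so `num_attention_heads` must divide evenly by the TP size, and this run requests `tp: 64` for a model with only 32 heads. A minimal sketch of the constraint (hypothetical helper, not nanotron's implementation):

```python
def heads_per_tp_rank(num_attention_heads: int, tp_size: int) -> int:
    # Tensor parallelism gives each TP rank an equal, whole share of the heads.
    assert num_attention_heads % tp_size == 0, (
        f"Number of attention heads ({num_attention_heads}) "
        f"must be divisible by TP size ({tp_size})."
    )
    return num_attention_heads // tp_size


print(heads_per_tp_rank(32, 16))   # 2 heads per TP rank -> passes the check
try:
    heads_per_tp_rank(32, 64)      # 32 % 64 != 0 -> same AssertionError as in the log above
except AssertionError as err:
    print(err)
```

With 32 heads, no TP size above 32 can satisfy the check, so the dp-1_tp-64 configurations in this sweep cannot get past model construction.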
-E0702 23:40:05.577000 139649365083968 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1122470) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0702 23:40:05.577000 140306096875328 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1757459) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0702 23:40:05.578000 140698504841024 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1380073) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0702 23:40:05.579000 139987077363520 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3745268) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0702 23:40:05.579000 140094763059008 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 862934) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0702 23:40:05.579000 140156427769664 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3873708) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0702 23:40:05.580000 139788966299456 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1401178) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0702 23:40:05.579000 139814066579264 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 849628) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time      : 2024-07-02_23:40:05
-  host      : ip-26-0-162-233.ec2.internal
-  rank      : 33 (local_rank: 1)
-  exitcode  : 1 (pid: 1380074)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[... entries [2]-[7] are identical apart from rank (34-39, local_rank 2-7) and pid (1380075-1380080) ...]
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2024-07-02_23:40:05
-  host      : ip-26-0-162-233.ec2.internal
-  rank      : 32 (local_rank: 0)
-  exitcode  : 1 (pid: 1380073)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-[... the torchrun agent on each of the other seven nodes prints the same traceback and an equivalent FAILED report: ip-26-0-160-225 (ranks 0-7), ip-26-0-161-103 (ranks 8-15), ip-26-0-161-153 (ranks 16-23), ip-26-0-161-78 (ranks 24-31), ip-26-0-171-102 (ranks 40-47), ip-26-0-171-62 (ranks 48-55), ip-26-0-171-88 (ranks 56-63) ...]
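The per-rank `error_file` fields in these reports are empty because the worker processes never wrote an error file; the tracebacks only appear in the interleaved stdout above. The linked elastic-errors page describes wrapping the entrypoint with the `@record` decorator so torchrun can surface the failing rank's exception in the FAILED report. A small, self-contained sketch of that pattern (hypothetical `main`, not taken from run_train.py):

```python
# Illustrative sketch: @record makes a failing rank dump its exception to the
# error file that torchrun's FAILED report references.
from torch.distributed.elastic.multiprocessing.errors import record


@record
def main() -> None:
    # Hypothetical stand-in for the real training entrypoint; any exception
    # raised here (e.g. the head-divisibility AssertionError) would then show
    # up under "traceback" in the report instead of only in the raw log.
    ...


if __name__ == "__main__":
    main()
```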
-srun: error: ip-26-0-160-225: task 0: Exited with exit code 1
-srun: error: ip-26-0-161-78: task 1: Exited with exit code 1
-srun: error: ip-26-0-162-233: task 4: Exited with exit code 1
-srun: error: ip-26-0-161-153: task 3: Exited with exit code 1
-srun: error: ip-26-0-171-62: task 5: Exited with exit code 1
-srun: error: ip-26-0-171-88: task 6: Exited with exit code 1
-srun: error: ip-26-0-171-102: task 7: Exited with exit code 1
-srun: error: ip-26-0-161-103: task 2: Exited with exit code 1
-Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt
deleted file mode 100644
index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-16/status.txt
+++ /dev/null
@@ -1 +0,0 @@
-fail
\ No newline at end of file
diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/bench.slurm
deleted file mode 100644
index b2d980a11f8ea880d1e5bb2155e907c67dcef949..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/bench.slurm
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-
-#SBATCH --job-name=bench_cluster
-#SBATCH --time=01:30:00
-#SBATCH --partition=hopper-prod
-#SBATCH --nodes=8
-#SBATCH --gres=gpu:8
-#SBATCH --qos=high
-#SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=96
-#SBATCH --exclusive
-#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out
-#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out
-
-# Function to update status based on squeue output
-update_status() {
-    job_id=$1
-    status_file=$2
-    # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones.
-    while true; do
-        job_status=$(squeue --job $job_id --noheader --format=%T)
-        echo "Job status: $job_status"
-        if [ -z "$job_status" ]; then
-            # Job has finished or is not found
-            break
-        elif [ "$job_status" = "RUNNING" ]; then
-            printf "running" > $status_file
-            break
-        fi
-        sleep 10
-    done
-}
-
-# Misc initializations.
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/config.yaml deleted file mode 100644 index 4b1c21c74bb95953a5db05f908d4f19f54bc5507..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 512 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 2 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out deleted file mode 100644 index 4a08247e0bcb87e9dc7f75dcca8f8ea974a347ca..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/log.out +++ /dev/null @@ -1,2053 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:36:40 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:36:46.626000 139982100809536 torch/distributed/run.py:757] -W0703 09:36:46.626000 139982100809536 torch/distributed/run.py:757] ***************************************** -W0703 09:36:46.626000 139982100809536 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:36:46.626000 139982100809536 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.589000 139874149406528 torch/distributed/run.py:757] -W0703 09:36:47.589000 139874149406528 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.589000 139874149406528 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:36:47.589000 139874149406528 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.597000 140606782822208 torch/distributed/run.py:757] -W0703 09:36:47.597000 140606782822208 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.597000 140606782822208 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:36:47.597000 140606782822208 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.608000 139737384494912 torch/distributed/run.py:757] -W0703 09:36:47.608000 139737384494912 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.608000 139737384494912 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:36:47.608000 139737384494912 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.641000 140411994597184 torch/distributed/run.py:757] -W0703 09:36:47.641000 140411994597184 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.641000 140411994597184 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:36:47.641000 140411994597184 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.645000 139849914173248 torch/distributed/run.py:757] -W0703 09:36:47.645000 139849914173248 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.645000 139849914173248 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:36:47.645000 139849914173248 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.743000 139650165561152 torch/distributed/run.py:757] -W0703 09:36:47.743000 139650165561152 torch/distributed/run.py:757] ***************************************** -W0703 09:36:47.743000 139650165561152 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:36:47.743000 139650165561152 torch/distributed/run.py:757] ***************************************** -W0703 09:36:48.036000 140589555636032 torch/distributed/run.py:757] -W0703 09:36:48.036000 140589555636032 torch/distributed/run.py:757] ***************************************** -W0703 09:36:48.036000 140589555636032 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:36:48.036000 140589555636032 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:37:13 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=64, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:37:13 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=2, -[default0]:07/03/2024 09:37:13 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=512, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2')), -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304) -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 09:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model_instance() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: model = self._init_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank29]: model = build_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ 
-[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank24]: trainer = DistributedTrainer(config_file) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank24]: self.model = self.init_model() # Defines self.model -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank24]: model = self._init_model_instance() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank24]: model = self._init_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank24]: model = build_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank24]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank24]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank24]: self.attn = CausalSelfAttention( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank24]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank24]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank26]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank27]: trainer = DistributedTrainer(config_file) -[default2]:[rank26]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank25]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: model = self._init_model_instance() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: model = self._init_model_instance() -[default3]:[rank27]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: model = self._init_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank27]: model = self._init_model_instance() -[default1]:[rank25]: model = self._init_model( -[default2]:[rank26]: model = build_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank25]: model = build_model( -[default3]:[rank27]: model = self._init_model( -[default2]:[rank26]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank27]: model = build_model( -[default2]:[rank26]: self.pp_block = 
self.module_builder(**self.module_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank25]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank27]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank26]: self.attn = CausalSelfAttention( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank26]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank27]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: self.attn = CausalSelfAttention( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank27]: self.attn = CausalSelfAttention( -[default2]:[rank26]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank25]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank27]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank52]: trainer = DistributedTrainer(config_file) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank48]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default4]:[rank28]: trainer = DistributedTrainer(config_file) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: self.model = self.init_model() # Defines self.model -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default4]:[rank28]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank52]: self.model = self.init_model() # Defines self.model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default4]:[rank52]: model = self._init_model_instance() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank48]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: model = self._init_model_instance() -[default4]:[rank52]: model = self._init_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: 
model = build_model( -[default7]:[rank31]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: model = self._init_model( -[default0]:[rank48]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank52]: model = build_model( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: model = build_model( -[default0]:[rank48]: model = self._init_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank52]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank48]: model = build_model( -[default4]:[rank28]: model = self._init_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default4]:[rank28]: model = build_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank52]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank28]: 
block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank52]: self.attn = CausalSelfAttention( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank28]: self.attn = CausalSelfAttention( -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default0]:[rank48]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank48]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank28]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank48]: self.attn = CausalSelfAttention( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank48]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank52]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank52]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank48]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank49]: trainer = DistributedTrainer(config_file) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank49]: self.model = self.init_model() # Defines self.model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank49]: model = self._init_model_instance() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank49]: model = self._init_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: model = build_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank49]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank49]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank49]: self.attn = CausalSelfAttention( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank49]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank49]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank54]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default5]:[rank53]: trainer = DistributedTrainer(config_file) -[default6]:[rank54]: self.model = self.init_model() # Defines self.model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank54]: model = self._init_model_instance() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank53]: self.model = self.init_model() # Defines self.model -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = self._init_model_instance() -[default6]:[rank54]: model = self._init_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank53]: model = self._init_model_instance() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank54]: model = build_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: model = self._init_model( -[default3]:[rank51]: model = self._init_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank54]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank53]: model = build_model( -[default3]:[rank51]: model = build_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank54]: self.pp_block = 
self.module_builder(**self.module_kwargs) -[default5]:[rank53]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank53]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank54]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank54]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank54]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank53]: self.attn = CausalSelfAttention( -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank53]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank53]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank55]: trainer = DistributedTrainer(config_file) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank55]: self.model = self.init_model() # Defines self.model -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank55]: model = self._init_model_instance() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank55]: model = self._init_model( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank55]: model = build_model( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank55]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank55]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank55]: self.attn = CausalSelfAttention( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank55]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank55]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: trainer = DistributedTrainer(config_file) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank30]: self.model = self.init_model() # Defines self.model -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank30]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank30]: model = self._init_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank30]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank30]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank30]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank30]: self.attn = CausalSelfAttention( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank30]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank30]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank0]: Traceback (most recent call last):
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default0]:[rank0]:     trainer = DistributedTrainer(config_file)
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default0]:[rank0]:     self.model = self.init_model()  # Defines self.model
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default0]:[rank0]:     model = self._init_model_instance()
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default0]:[rank0]:     model = self._init_model(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default0]:[rank0]:     model = build_model(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default0]:[rank0]:     block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default0]:[rank0]:     self.pp_block = self.module_builder(**self.module_kwargs)
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default0]:[rank0]:     self.attn = CausalSelfAttention(
-[default0]:[rank0]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default0]:[rank0]:     config.num_attention_heads % tp_pg.size() == 0
-[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
[The remaining ranks of the 64-GPU job (ranks 1-63) emit the identical AssertionError traceback; their interleaved duplicates are collapsed into the single representative trace above.]
-[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank13]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: model = self._init_model_instance() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: model = build_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank56]: self.attn = CausalSelfAttention( -[default0]:[rank8]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank62]: model = self._init_model_instance() -[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank15]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank62]: model = self._init_model( -[default5]:[rank13]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank57]: self.attn = CausalSelfAttention( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank9]: self.model = self.init_model() # Defines self.model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank15]: self.attn = CausalSelfAttention( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank15]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank15]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank57]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank57]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank62]: model = build_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank60]: model = self._init_model( -[default7]:[rank63]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank62]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank7]: trainer = DistributedTrainer(config_file) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank7]: self.model = self.init_model() # Defines self.model -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank7]: model = self._init_model_instance() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank7]: model = self._init_model( -[default7]:[rank63]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank7]: model = build_model( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank7]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank62]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank7]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank7]: self.attn = CausalSelfAttention( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank63]: self.attn = CausalSelfAttention( -[default4]:[rank60]: model = build_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank7]: 
config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank62]: self.attn = CausalSelfAttention( -[default7]:[rank7]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank63]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank63]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank62]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank60]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank62]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank60]: self.attn = CausalSelfAttention( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank60]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank9]: model = self._init_model_instance() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank9]: model = self._init_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: model = build_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank9]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank9]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank9]: self.attn = CausalSelfAttention( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank9]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank9]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank45]: model = self._init_model_instance() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank42]: trainer = DistributedTrainer(config_file) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank42]: self.model = self.init_model() # Defines self.model -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank42]: model = self._init_model_instance() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank42]: model = self._init_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank42]: model = build_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank42]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank42]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank42]: self.attn = CausalSelfAttention( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank42]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank42]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank46]: trainer = DistributedTrainer(config_file) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank46]: self.model = self.init_model() # Defines self.model -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank46]: model = self._init_model_instance() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank46]: model = self._init_model( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank46]: model = build_model( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank46]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank46]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank46]: self.attn = CausalSelfAttention( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank46]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank46]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank47]: trainer = DistributedTrainer(config_file) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank47]: self.model = self.init_model() # Defines self.model -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank47]: model = self._init_model_instance() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank47]: model = self._init_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank47]: model = build_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank47]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank47]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank47]: self.attn = CausalSelfAttention( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank47]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank40]: Traceback (most recent call last): -[default7]:[rank47]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: trainer = DistributedTrainer(config_file) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank40]: self.model = self.init_model() # Defines self.model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank40]: model = self._init_model_instance() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank40]: model = self._init_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank40]: model = build_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank40]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank40]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank40]: self.attn = CausalSelfAttention( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank40]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank40]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank44]: trainer = DistributedTrainer(config_file) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank44]: self.model = self.init_model() # Defines self.model -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank44]: model = self._init_model_instance() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank44]: model = self._init_model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank44]: model = build_model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank44]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank44]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank44]: self.attn = CausalSelfAttention( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank44]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank44]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank41]: trainer = DistributedTrainer(config_file) -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank43]: trainer = DistributedTrainer(config_file) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank41]: self.model = self.init_model() # Defines self.model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: model = self._init_model_instance() -[default3]:[rank43]: self.model = self.init_model() # Defines self.model -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank43]: model = self._init_model_instance() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank43]: model = self._init_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank41]: model = self._init_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank41]: model = build_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank43]: model = build_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank41]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank43]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank41]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank41]: self.attn = CausalSelfAttention( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank43]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank43]: self.attn = CausalSelfAttention( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank41]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank41]: AssertionError: Number of attention 
heads (32) must be divisible by TP size (64). -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank43]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank43]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -W0703 09:37:19.004000 140589555636032 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 472069 closing signal SIGTERM -W0703 09:37:19.005000 140589555636032 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 472071 closing signal SIGTERM -W0703 09:37:19.005000 140589555636032 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 472072 closing signal SIGTERM -E0703 09:37:19.017000 139737384494912 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 205524) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 205525) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 205526) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 205527) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 205528) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 205529) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 
62 (local_rank: 6) - exitcode : 1 (pid: 205530) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 205531) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:19 - host : ip-26-0-166-125.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 205524) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:37:19.111000 139874149406528 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 42656) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:37:19.113000 139849914173248 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 961657) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:37:19.113000 140606782822208 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 859519) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:37:19.113000 140411994597184 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 743873) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:37:19.116000 139982100809536 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 76374) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:37:19.115000 139650165561152 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 946832) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -Traceback (most recent call last): - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:18 - host : ip-26-0-163-147.ec2.internal - rank : 33 (local_rank: 1) - exitcode : 1 (pid: 859520) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:18 - host : ip-26-0-163-147.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 859521) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:18 - host : ip-26-0-163-147.ec2.internal - rank : 35 (local_rank: 3) - exitcode : 1 (pid: 859522) - error_file: - traceback : To enable traceback see: https: sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -//pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:19 - host : ip-26-0-163-147.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 859523) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:37:19 - host : ip-26-0-163-147.ec2.internal - rank : 37 (local_rank: 5) - exitcode : 1 (pid: 859524) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:37:19 - host : ip-26-0-163-147.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 (pid: 859525) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:37:19 - host : ip-26-0-163-147.ec2.internal - rank : 39 (local_rank: 7) - exitcode : 1 (pid: 859526) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:18 - host : ip-26-0-163-147.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 859519) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ 
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 743874) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 743875) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 743876) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 743877) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 743878) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 743879) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 743880) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:18 - host : ip-26-0-161-138.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 743873) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - sys.exit(main()) - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED 
------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 76375) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 76376) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 76377) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 76378) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 76379) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 76380) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 76381) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.h sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -tml ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:19 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 76374) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 
946833) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 946834) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 946835) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 946836) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 946837) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 946838) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 946839) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/ela elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -stic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 946832) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 961658) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 961659) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 961660) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 961661) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time 
: 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 961662) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 961663) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 961664) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:19 - host : ip-26-0-165-24.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 961657) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 42657) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 42658) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 42659) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 42660) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 42661) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 42662) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 42663) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:19 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 42656) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:37:19.221000 140589555636032 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 472065) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:37:19.247000 140589555636032 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_471989_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:37:19.272000 140589555636032 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_471989_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:37:19.295000 140589555636032 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_471989_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:37:19 - host : ip-26-0-164-207.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 472066) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:37:19 - host : ip-26-0-164-207.ec2.internal - rank : 42 (local_rank: 2) - exitcode : 1 (pid: 472067) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:37:19 - host : ip-26-0-164-207.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 472068) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:37:19 - host : ip-26-0-164-207.ec2.internal - rank : 45 (local_rank: 5) - exitcode : 1 (pid: 472070) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:37:19 - host : ip-26-0-164-207.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 472065) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-166-125: task 7: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-147: task 4: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
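The log above ends with torchrun's per-rank failure summary and the srun exit errors for all eight nodes; bench.slurm then greps log.out to decide which status string to record. The helper below is a minimal sketch of that classification plus extraction of torchrun's "Root Cause" block. It is not part of the original repository: the file name, function names, and the regular expression are assumptions, and the substring checks simply mirror the grep patterns used in bench.slurm (note that the real script decides "completed" vs. "fail" from srun's exit status, not from the log text).

#!/usr/bin/env python3
# classify_log.py -- hypothetical helper, mirrors the grep checks in bench.slurm.
import re
import sys
from pathlib import Path

def classify(log_path: str) -> str:
    """Map a bench_cluster log.out to a status string ('oom'/'timeout'/'fail')."""
    text = Path(log_path).read_text(errors="replace")
    if "OutOfMemoryError" in text or "CUDA error: an illegal memory access" in text:
        return "oom"          # bench.slurm records both of these cases as "oom"
    if "Timeout at NCCL" in text:
        return "timeout"
    return "fail"             # the real script only reaches this branch when srun failed

def root_cause(log_path: str) -> str | None:
    """Return torchrun's 'Root Cause (first observed failure)' block, if present."""
    text = Path(log_path).read_text(errors="replace")
    match = re.search(r"Root Cause \(first observed failure\):\s*(.*?)\n=+", text, re.S)
    return match.group(1).strip() if match else None

if __name__ == "__main__":
    path = sys.argv[1]  # e.g. a results .../log.out file
    print(classify(path))
    block = root_cause(path)
    if block:
        print(block)

Run against the log above, the root-cause block points at rank 48 on ip-26-0-165-24 and rank 24 on ip-26-0-161-78, both exiting with code 1.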
diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-2/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/bench.slurm deleted file mode 100644 index fea2b99b5b2bc0d968d7a81229a10997b7a47b24..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/config.yaml deleted file mode 100644 index db4691e1ba442657d4f45d456d35d1506c98d107..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 4 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 256 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out deleted file mode 100644 index 16d2884ff967eacb228a00eb7f5f056b28dd9921..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 02:58:03 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 02:58:09.145000 140002791987008 torch/distributed/run.py:757] -W0703 02:58:09.145000 140002791987008 torch/distributed/run.py:757] ***************************************** -W0703 02:58:09.145000 140002791987008 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:09.145000 140002791987008 torch/distributed/run.py:757] ***************************************** -W0703 02:58:09.280000 139631123212096 torch/distributed/run.py:757] -W0703 02:58:09.280000 139631123212096 torch/distributed/run.py:757] ***************************************** -W0703 02:58:09.280000 139631123212096 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:09.280000 139631123212096 torch/distributed/run.py:757] ***************************************** -W0703 02:58:09.616000 139712217130816 torch/distributed/run.py:757] -W0703 02:58:09.616000 139712217130816 torch/distributed/run.py:757] ***************************************** -W0703 02:58:09.616000 139712217130816 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:09.616000 139712217130816 torch/distributed/run.py:757] ***************************************** -W0703 02:58:09.638000 140224119809856 torch/distributed/run.py:757] -W0703 02:58:09.638000 140224119809856 torch/distributed/run.py:757] ***************************************** -W0703 02:58:09.638000 140224119809856 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:09.638000 140224119809856 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.015000 140629962319680 torch/distributed/run.py:757] -W0703 02:58:10.015000 140629962319680 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.015000 140629962319680 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 02:58:10.015000 140629962319680 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.046000 140428076730176 torch/distributed/run.py:757] -W0703 02:58:10.046000 140428076730176 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.046000 140428076730176 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:10.046000 140428076730176 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.099000 139856365946688 torch/distributed/run.py:757] -W0703 02:58:10.099000 139856365946688 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.099000 139856365946688 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:10.099000 139856365946688 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.197000 140451105965888 torch/distributed/run.py:757] -W0703 02:58:10.197000 140451105965888 torch/distributed/run.py:757] ***************************************** -W0703 02:58:10.197000 140451105965888 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:58:10.197000 140451105965888 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 02:58:35 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=64, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:58:35 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50304), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=256, -[default0]:07/03/2024 02:58:35 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=4, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256')), -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50304) -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 02:58:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank63]: trainer = DistributedTrainer(config_file) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank63]: self.model = self.init_model() # Defines self.model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: model = self._init_model_instance() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank63]: model = self._init_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank63]: model = build_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank63]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank63]: self.attn = CausalSelfAttention( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ 
-[default7]:[rank63]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank63]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: trainer = DistributedTrainer(config_file) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank56]: self.model = self.init_model() # Defines self.model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank56]: model = self._init_model_instance() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank56]: model = self._init_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: model = build_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank56]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank56]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank56]: self.attn = CausalSelfAttention( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank60]: trainer = DistributedTrainer(config_file) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank60]: self.model = self.init_model() # Defines self.model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank60]: model = self._init_model_instance() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank60]: model = self._init_model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank60]: model = build_model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank60]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank60]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank60]: self.attn = CausalSelfAttention( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank60]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank58]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank58]: model = self._init_model_instance() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank58]: model = self._init_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank59]: trainer = DistributedTrainer(config_file) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank59]: self.model = self.init_model() # Defines self.model -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank59]: model = self._init_model_instance() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank59]: model = self._init_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank59]: model = build_model( -[default2]:[rank58]: model = build_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank58]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank59]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank58]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank59]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank59]: self.attn = CausalSelfAttention( -[default2]:[rank58]: self.attn = CausalSelfAttention( -[default6]:[rank62]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank62]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank59]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank59]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank61]: trainer = DistributedTrainer(config_file) -[default6]:[rank62]: self.model = self.init_model() # Defines self.model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank62]: model = self._init_model_instance() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank62]: model = self._init_model( -[default5]:[rank61]: self.model = self.init_model() # Defines self.model -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank61]: model = self._init_model_instance() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank62]: model = build_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank61]: model = self._init_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank61]: model = build_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank62]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank61]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank61]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank61]: self.attn = CausalSelfAttention( -[default1]:[rank57]: trainer = DistributedTrainer(config_file) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank61]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank57]: self.model = self.init_model() # Defines self.model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank62]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank62]: self.attn = CausalSelfAttention( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank61]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank62]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank62]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank57]: model = self._init_model_instance() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank57]: model = self._init_model( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank57]: model = build_model( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank57]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank57]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank57]: self.attn = CausalSelfAttention( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank57]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank57]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank0]: trainer = DistributedTrainer(config_file) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank0]: self.model = self.init_model() # Defines self.model -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank0]: model = self._init_model_instance() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank0]: model = self._init_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank0]: model = build_model( -[default0]:[ran[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -k0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank0]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank41]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank0]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank0]: self.attn = CausalSelfAttention( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank0]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank41]: self.model = self.init_model() # Defines self.model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: model = self._init_model_instance() -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model_instance() -[default1]:[rank41]: model = self._init_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank41]: model = build_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank41]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank41]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank41]: self.attn = CausalSelfAttention( -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank41]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank41]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: trainer = DistributedTrainer(config_file) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank40]: self.model = self.init_model() # Defines self.model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank40]: model = self._init_model_instance() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank40]: model = self._init_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank40]: model = build_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank40]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank40]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank40]: self.attn = CausalSelfAttention( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank40]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank40]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank3]: trainer = DistributedTrainer(config_file) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank3]: self.model = self.init_model() # Defines self.model -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank3]: model = self._init_model_instance() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank3]: model = self._init_model( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank3]: model = build_model( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank3]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank43]: trainer = DistributedTrainer(config_file) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank43]: self.model = self.init_model() # Defines self.model -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank43]: model = self._init_model_instance() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank43]: model = self._init_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank43]: model = build_model( -[def[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank3]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank3]: self.attn = CausalSelfAttention( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank3]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank3]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank43]: Traceback (most recent call last):
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default3]:[rank43]:     trainer = DistributedTrainer(config_file)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default3]:[rank43]:     self.model = self.init_model()  # Defines self.model
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default3]:[rank43]:     model = self._init_model_instance()
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default3]:[rank43]:     model = self._init_model(
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default3]:[rank43]:     model = build_model(
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default3]:[rank43]:     block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default3]:[rank43]:     self.pp_block = self.module_builder(**self.module_kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default3]:[rank43]:     self.attn = CausalSelfAttention(
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default3]:[rank43]:     config.num_attention_heads % tp_pg.size() == 0
-[default3]:[rank43]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
[Every other rank appearing in this log (ranks 1-23, 42, 44, 46-55) prints the same interleaved traceback, each ending with "AssertionError: Number of attention heads (32) must be divisible by TP size (64)."]
-[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank24]: trainer = DistributedTrainer(config_file) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank24]: self.model = self.init_model() # Defines self.model -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank24]: model = self._init_model_instance() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank24]: model = self._init_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank24]: model = build_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank24]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank24]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank24]: self.attn = CausalSelfAttention( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank24]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank24]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank26]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank27]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: self.model = self.init_model() # Defines self.model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: self.model = self.init_model() # Defines self.model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank27]: self.model = self.init_model() # Defines self.model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank25]: model = self._init_model_instance() -[default3]:[rank27]: model = self._init_model_instance() -[default2]:[rank26]: model = self._init_model_instance() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: model = self._init_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: model = build_model( -[default3]:[rank27]: model = self._init_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank25]: model = self._init_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank25]: model = build_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank27]: model = build_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank27]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) 
-[default1]:[rank25]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank26]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank25]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: self.attn = CausalSelfAttention( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank27]: self.attn = CausalSelfAttention( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank25]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank30]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank26]: self.attn = CausalSelfAttention( -[default3]:[rank27]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank30]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank26]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank30]: model = self._init_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank30]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank30]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank30]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank30]: self.attn = CausalSelfAttention( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank30]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank30]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model_instance() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank29]: model = build_model( -[default7]:[rank31]: model = self._init_model_instance() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: model = self._init_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: model = build_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank28]: trainer = DistributedTrainer(config_file) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: self.model = self.init_model() # Defines self.model -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank28]: model = self._init_model_instance() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank28]: model = self._init_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank28]: model = build_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank28]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank28]: self.attn = CausalSelfAttention( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank28]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank38]: trainer = DistributedTrainer(config_file) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank38]: self.model = self.init_model() # Defines self.model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank38]: model = self._init_model_instance() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank38]: model = self._init_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank38]: model = build_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank38]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank38]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank38]: self.attn = CausalSelfAttention( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank38]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank38]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank36]: trainer = DistributedTrainer(config_file) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank36]: self.model = self.init_model() # Defines self.model -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank36]: model = self._init_model_instance() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank36]: model = self._init_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank36]: model = build_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank36]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank36]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank36]: self.attn = CausalSelfAttention( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank36]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank36]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank32]: trainer = DistributedTrainer(config_file) -[default3]:[rank35]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank32]: self.model = self.init_model() # Defines self.model -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank35]: trainer = DistributedTrainer(config_file) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank35]: self.model = self.init_model() # Defines self.model -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank35]: model = self._init_model_instance() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank35]: model = self._init_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank32]: model = self._init_model_instance() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank32]: model = self._init_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank32]: model = build_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank32]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank35]: model = build_model( -[default0]:[rank32]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank35]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank32]: self.attn = CausalSelfAttention( -[default3]:[rank35]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank35]: self.attn = CausalSelfAttention( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank32]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank35]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank32]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank33]: Traceback (most recent call last): -[default2]:[rank34]: Traceback (most recent call last): -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: trainer = DistributedTrainer(config_file) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: self.model = self.init_model() # Defines self.model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: model = self._init_model_instance() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank33]: model = self._init_model( -[default2]:[rank34]: trainer = DistributedTrainer(config_file) -[default5]:[rank37]: trainer = DistributedTrainer(config_file) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: model = build_model( -[default5]:[rank37]: self.model = self.init_model() # Defines self.model -[default2]:[rank34]: self.model = self.init_model() # Defines self.model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: model = self._init_model_instance() -[default2]:[rank34]: model = self._init_model_instance() -[default1]:[rank33]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank37]: model = self._init_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: self.pp_block = 
self.module_builder(**self.module_kwargs) -[default2]:[rank34]: model = self._init_model( -[default5]:[rank37]: model = build_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: self.attn = CausalSelfAttention( -[default2]:[rank34]: model = build_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank37]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank34]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank37]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank33]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank34]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank34]: self.attn = CausalSelfAttention( -[default5]:[rank37]: self.attn = CausalSelfAttention( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank37]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank37]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank34]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank34]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank39]: trainer = DistributedTrainer(config_file) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank39]: self.model = self.init_model() # Defines self.model -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank39]: model = self._init_model_instance() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank39]: model = self._init_model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank39]: model = build_model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank39]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank39]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank39]: self.attn = CausalSelfAttention( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank39]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank39]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
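The assertion above fires in nanotron's CausalSelfAttention.__init__ (src/nanotron/models/llama.py, line 271): the llama-1B config has 32 attention heads, so a tensor-parallel size of 64 cannot give each TP rank a whole number of heads, and every rank aborts before training starts. Below is a minimal pre-flight sketch of the same divisibility check; the helper name and the standalone script are illustrative assumptions, not part of the benchmark tooling or of nanotron.

    # check_tp.py - hypothetical pre-flight check mirroring the assertion in
    # nanotron's CausalSelfAttention; not part of bench_cluster.
    def tp_size_is_valid(num_attention_heads: int, tp_size: int) -> bool:
        # Each tensor-parallel rank must own a whole number of attention heads.
        return num_attention_heads % tp_size == 0

    if __name__ == "__main__":
        # llama-1B reports 32 attention heads in the error above.
        assert not tp_size_is_valid(32, 64)  # the dp-1_tp-64_pp-1 runs fail this check
        assert tp_size_is_valid(32, 32)      # any TP size that divides 32 would pass it

Filtering candidate (dp, tp, pp) combinations with a check like this before generating the Slurm jobs would turn this class of failure into a skipped configuration rather than an eight-node launch that dies at model init.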
-E0703 02:58:41.491000 140451105965888 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 403823) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time : 2024-07-03_02:58:41
-  host : ip-26-0-164-207.ec2.internal
-  rank : 17 (local_rank: 1)
-  exitcode : 1 (pid: 403824)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[Entries [2]-[7] report ranks 18-23 (local_ranks 2-7, pids 403825-403830) on the same host, with the same timestamp and exit code.]
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time : 2024-07-03_02:58:41
-  host : ip-26-0-164-207.ec2.internal
-  rank : 16 (local_rank: 0)
-  exitcode : 1 (pid: 403823)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-E0703 02:58:41.581000 140224119809856 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 318787) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:58:41.584000 139712217130816 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 320290) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:58:41.585000 140002791987008 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1656410) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:58:41.584000 140629962319680 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 893237) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:58:41.583000 140428076730176 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 790565) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:58:41.587000 139631123212096 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 832684) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:58:41.586000 139856365946688 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 201657) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
[The torchrun agents on the remaining seven nodes printed the same traceback and an identical ChildFailedError report for their local ranks: ip-26-0-173-246 (ranks 48-55), ip-26-0-169-247 (ranks 40-47), ip-26-0-174-36 (ranks 56-63), ip-26-0-165-24 (ranks 24-31), ip-26-0-162-233 (ranks 0-7), ip-26-0-163-147 (ranks 8-15) and ip-26-0-169-139 (ranks 32-39), interleaved in the original log.]
exitcode : 1 (pid: 201657) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-247: task 5: Exited with exit code 1 -srun: error: ip-26-0-173-246: task 6: Exited with exit code 1 -srun: error: ip-26-0-163-147: task 1: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 2: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -srun: error: ip-26-0-169-139: task 4: Exited with exit code 1 -srun: error: ip-26-0-162-233: task 0: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-256/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/bench.slurm deleted file mode 100644 index c2b3f035216228af1341d7409f17e34faed19ae9..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/config.yaml deleted file mode 100644 index 93b444d438539b7c0233215e24cc82d026609d07..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 32 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 32 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out deleted file mode 100644 index 55271df6270e23e38bbb0d417b94b0f8551b7289..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 08:41:16 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 08:41:22.158000 140150072264512 torch/distributed/run.py:757] -W0703 08:41:22.158000 140150072264512 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.158000 140150072264512 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 08:41:22.158000 140150072264512 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.460000 140461360998208 torch/distributed/run.py:757] -W0703 08:41:22.460000 140461360998208 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.460000 140461360998208 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 08:41:22.460000 140461360998208 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.741000 140198317426496 torch/distributed/run.py:757] -W0703 08:41:22.741000 140198317426496 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.741000 140198317426496 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 08:41:22.741000 140198317426496 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.816000 139687847597888 torch/distributed/run.py:757] -W0703 08:41:22.816000 139687847597888 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.816000 139687847597888 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 08:41:22.816000 139687847597888 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.881000 139714960070464 torch/distributed/run.py:757] -W0703 08:41:22.881000 139714960070464 torch/distributed/run.py:757] ***************************************** -W0703 08:41:22.881000 139714960070464 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 08:41:22.881000 139714960070464 torch/distributed/run.py:757] ***************************************** -W0703 08:41:23.050000 140126356367168 torch/distributed/run.py:757] -W0703 08:41:23.050000 140126356367168 torch/distributed/run.py:757] ***************************************** -W0703 08:41:23.050000 140126356367168 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 08:41:23.050000 140126356367168 torch/distributed/run.py:757] ***************************************** -W0703 08:41:23.066000 140349233342272 torch/distributed/run.py:757] -W0703 08:41:23.066000 140349233342272 torch/distributed/run.py:757] ***************************************** -W0703 08:41:23.066000 140349233342272 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 08:41:23.066000 140349233342272 torch/distributed/run.py:757] ***************************************** -W0703 08:41:23.386000 140103114262336 torch/distributed/run.py:757] -W0703 08:41:23.386000 140103114262336 torch/distributed/run.py:757] ***************************************** -W0703 08:41:23.386000 140103114262336 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 08:41:23.386000 140103114262336 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 08:41:48 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=64, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 08:41:48 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=32, -[default0]:07/03/2024 08:41:48 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=32, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32')), -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304) -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 08:41:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank59]: trainer = DistributedTrainer(config_file) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank59]: self.model = self.init_model() # Defines self.model -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank59]: model = self._init_model_instance() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank59]: model = self._init_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank59]: model = build_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank59]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank59]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank59]: self.attn = CausalSelfAttention( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ 
-[default3]:[rank59]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank59]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank61]: trainer = DistributedTrainer(config_file) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank61]: self.model = self.init_model() # Defines self.model -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank61]: model = self._init_model_instance() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank61]: model = self._init_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank61]: model = build_model( -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank58]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank58]: model = self._init_model_instance() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank61]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank58]: model = self._init_model( -[default5]:[rank61]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank61]: self.attn = CausalSelfAttention( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank61]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank61]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank58]: model = build_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank58]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank58]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank58]: self.attn = CausalSelfAttention( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank58]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank56]: Traceback (most recent call last): -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank62]: trainer = DistributedTrainer(config_file) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank56]: trainer = DistributedTrainer(config_file) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank56]: self.model = self.init_model() # Defines self.model -[default6]:[rank62]: self.model = self.init_model() # Defines self.model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank62]: model = self._init_model_instance() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank62]: model = self._init_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: model = self._init_model_instance() -[default6]:[rank62]: model = build_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank62]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank62]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank62]: self.attn = CausalSelfAttention( 
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank62]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank62]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank56]: model = self._init_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: model = build_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank56]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank56]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank56]: self.attn = CausalSelfAttention( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank57]: trainer = DistributedTrainer(config_file) -[default4]:[rank60]: Traceback (most recent call last): -[default7]:[rank63]: trainer = DistributedTrainer(config_file) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank60]: trainer = DistributedTrainer(config_file) -[default1]:[rank57]: self.model = self.init_model() # Defines self.model -[default7]:[rank63]: self.model = self.init_model() # Defines self.model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank60]: self.model = self.init_model() # Defines self.model -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: model = self._init_model_instance() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: model = self._init_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", 
line 751, in _init_model -[default1]:[rank57]: model = self._init_model_instance() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank60]: model = self._init_model_instance() -[default7]:[rank63]: model = build_model( -[default1]:[rank57]: model = self._init_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank63]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank60]: model = self._init_model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank60]: model = build_model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank57]: model = build_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank60]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank63]: self.attn = CausalSelfAttention( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank57]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank60]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank63]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank63]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank57]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank57]: self.attn = CausalSelfAttention( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank57]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank60]: self.attn = CausalSelfAttention( -[default1]:[rank57]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank60]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank18]: trainer = DistributedTrainer(config_file) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: model = self._init_model_instance() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank18]: model = self._init_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: model = build_model( -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: trainer = DistributedTrainer(config_file) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank18]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: model = self._init_model_instance() -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank17]: trainer = DistributedTrainer(config_file) -[default2]:[rank18]: self.attn = CausalSelfAttention( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: model = self._init_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank17]: self.model = self.init_model() # Defines self.model 
-[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank16]: model = build_model( -[default1]:[rank17]: model = self._init_model_instance() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank17]: model = self._init_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank17]: model = build_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank17]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank17]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank17]: self.attn = CausalSelfAttention( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank16]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank16]: self.attn = CausalSelfAttention( -[default1]:[rank17]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank17]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank48]: trainer = DistributedTrainer(config_file) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank48]: self.model = self.init_model() # Defines self.model -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank48]: model = self._init_model_instance() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank48]: model = self._init_model( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank48]: model = build_model( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank48]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank48]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank48]: self.attn = CausalSelfAttention( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank48]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank48]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: model = build_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank42]: trainer = DistributedTrainer(config_file) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank42]: self.model = self.init_model() # Defines self.model -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank42]: model = self._init_model_instance() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank42]: model = self._init_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank42]: model = build_model( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank42]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank42]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank42]: self.attn = CausalSelfAttention( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank42]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank42]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank49]: trainer = DistributedTrainer(config_file) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank49]: self.model = self.init_model() # Defines self.model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank49]: model = self._init_model_instance() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank49]: model = self._init_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: model = build_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank49]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank49]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank53]: trainer = DistributedTrainer(config_file) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank53]: self.model = self.init_model() # Defines self.model -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank53]: model = self._init_model_instance() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank53]: model = self._init_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank49]: self.attn = CausalSelfAttention( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank49]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank53]: model = build_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank49]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank53]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank53]: self.attn = CausalSelfAttention( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank53]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank53]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank22]: Traceback (most recent call last): -[default3]:[rank19]: trainer = DistributedTrainer(config_file) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank19]: self.model = self.init_model() # Defines self.model -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank19]: model = self._init_model_instance() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank19]: model = self._init_model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank22]: trainer = DistributedTrainer(config_file) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank22]: self.model = self.init_model() # Defines self.model -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank22]: model = self._init_model_instance() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank22]: model = self._init_model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank19]: model = build_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank22]: model = build_model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank19]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank22]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank22]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ 
-[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank22]: self.attn = CausalSelfAttention( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank22]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank22]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank19]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank19]: self.attn = CausalSelfAttention( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank19]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank19]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank0]: trainer = DistributedTrainer(config_file) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank0]: self.model = self.init_model() # Defines self.model -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank0]: model = self._init_model_instance() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank0]: model = self._init_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank0]: model = build_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank0]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank0]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank0]: self.attn = CausalSelfAttention( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank0]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank21]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank20]: model = self._init_model_instance() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank20]: model = build_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank21]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank21]: model = self._init_model_instance() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank21]: model = self._init_model( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank21]: model = build_model( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank21]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank21]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank21]: self.attn = CausalSelfAttention( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank21]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank20]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank45]: model = self._init_model_instance() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank4]: trainer = DistributedTrainer(config_file) -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank3]: Traceback (most recent call last): -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank1]: trainer = DistributedTrainer(config_file) -[default2]:[rank2]: trainer = DistributedTrainer(config_file) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank2]: self.model = self.init_model() # Defines self.model -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank1]: self.model = self.init_model() # Defines self.model -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank1]: model = self._init_model_instance() -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank2]: model = self._init_model_instance() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: Traceback (most recent call last): -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank1]: model = self._init_model( -[default2]:[rank2]: model = self._init_model( -[default6]:[rank6]: trainer = DistributedTrainer(config_file) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank4]: self.model = self.init_model() # Defines self.model -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank55]: trainer = DistributedTrainer(config_file) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank55]: self.model = self.init_model() # Defines self.model -[default3]:[rank3]: trainer = DistributedTrainer(config_file) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in 
init_model -[default7]:[rank55]: model = self._init_model_instance() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank55]: model = self._init_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank2]: model = build_model( -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default7]:[rank55]: model = build_model( -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank5]: trainer = DistributedTrainer(config_file) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank54]: trainer = DistributedTrainer(config_file) -[default7]:[rank7]: trainer = DistributedTrainer(config_file) -[default7]:[rank55]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank54]: self.model = self.init_model() # Defines self.model -[default7]:[rank7]: self.model = self.init_model() # Defines self.model -[default7]:[rank55]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank5]: self.model = self.init_model() # Defines self.model -[default6]:[rank6]: self.model = self.init_model() # Defines self.model -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank2]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank 
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank46]: trainer = DistributedTrainer(config_file) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank46]: self.model = self.init_model() # Defines self.model -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank46]: model = self._init_model_instance() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank46]: model = self._init_model( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank46]: model = build_model( -[def[default6]:[rank6]: model = self._init_model_instance() -[default1]:[rank1]: model = build_model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -ault6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank46]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank46]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank46]: self.attn = CausalSelfAttention( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank46]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank46]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank54]: model = self._init_model_instance() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank3]: self.model = self.init_model() # Defines self.model -[default3]:[rank51]: model = self._init_model_instance() -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank25]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank25]: model = self._init_model_instance() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank25]: model = self._init_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank25]: model = build_model( -[def[default4]:[rank4]: model = self._init_model_instance() -[default6]:[rank54]: model = self._init_model( -[default7]:[rank55]: self.attn = CausalSelfAttention( -[default2]:[rank2]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank5]: model = self._init_model_instance() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank55]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank54]: model = build_model( -[default4]:[rank4]: model = self._init_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank55]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank5]: model = self._init_model( -ault1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank25]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank7]: model = self._init_model_instance() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank25]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: self.attn = CausalSelfAttention( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank25]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank3]: model = self._init_model_instance() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank54]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank54]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank2]: self.attn = CausalSelfAttention( -[default6]:[rank54]: self.attn = CausalSelfAttention( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank54]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank4]: model = build_model( -[default1]:[rank1]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank54]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: model = self._init_model( -[default2]:[rank2]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank6]: model = self._init_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = build_model( -[default2]:[rank2]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank1]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank5]: model = build_model( -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank44]: trainer = DistributedTrainer(config_file) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank44]: self.model = self.init_model() # Defines self.model -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank44]: model = self._init_model_instance() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank44]: model = self._init_model( -[default0]:[rank40]: Traceback (most recent call last): -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank52]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank3]: model = self._init_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: trainer = DistributedTrainer(config_file) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank4]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) 
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank44]: model = build_model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: self.model = self.init_model() # Defines self.model -[default3]:[rank3]: model = build_model( -[default4]:[rank44]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank5]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank1]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank1]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank44]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank52]: model = self._init_model_instance() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank3]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank52]: model = self._init_model( -[default5]:[rank5]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank6]: model = build_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank52]: model = build_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank47]: trainer = DistributedTrainer(config_file) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank3]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank1]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank44]: self.attn = CausalSelfAttention( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank7]: model = self._init_model( -[default1]:[rank41]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank44]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank6]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank4]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank44]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank41]: self.model = self.init_model() # Defines self.model -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank3]: self.attn = CausalSelfAttention( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank3]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank5]: self.attn = CausalSelfAttention( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank6]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank5]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank40]: self.model = self.init_model() # Defines self.model -[default7]:[rank47]: self.model = self.init_model() # Defines self.model -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: model = self._init_model_instance() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank40]: model = self._init_model_instance() -[default1]:[rank41]: model = self._init_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank47]: model = self._init_model_instance() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank41]: model = build_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank40]: model = self._init_model( -[default1]:[rank41]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank47]: model = self._init_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank40]: model = build_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank47]: model = build_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank41]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank47]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank47]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank40]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank40]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank47]: self.attn = CausalSelfAttention( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank40]: self.attn = CausalSelfAttention( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank47]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank47]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank41]: self.attn = CausalSelfAttention( -[default0]:[rank40]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank40]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank41]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank41]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank43]: trainer = DistributedTrainer(config_file) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank43]: self.model = self.init_model() # Defines self.model -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank43]: model = self._init_model_instance() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank43]: model = self._init_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank43]: model = build_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank43]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank43]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank43]: self.attn = CausalSelfAttention( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank43]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank43]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank28]: trainer = DistributedTrainer(config_file) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: self.model = self.init_model() # Defines self.model -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank28]: model = self._init_model_instance() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank28]: model = self._init_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank28]: model = build_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank28]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank28]: self.attn = CausalSelfAttention( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank28]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank24]: trainer = DistributedTrainer(config_file) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank24]: self.model = self.init_model() # Defines self.model -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank24]: model = self._init_model_instance() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank24]: model = self._init_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank24]: model = build_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank24]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank24]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank24]: self.attn = CausalSelfAttention( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank24]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank24]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model_instance() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: model = self._init_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank29]: model = build_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: trainer = DistributedTrainer(config_file) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank30]: self.model = self.init_model() # Defines self.model -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank30]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank30]: model = self._init_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank30]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank30]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank30]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank30]: self.attn = CausalSelfAttention( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank30]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank30]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank31]: model = self._init_model_instance() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: model = self._init_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: model = build_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[... the remaining ranks on every node hit the same traceback and assertion; their log lines are interleaved in the original output, so only each rank's final assertion line is kept below ...]
-[default3]:[rank3]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank4]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default5]:[rank5]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank6]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank7]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default1]:[rank9]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default2]:[rank10]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank12]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default5]:[rank13]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank15]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default5]:[rank21]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank23]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default2]:[rank26]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default0]:[rank32]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default1]:[rank33]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default2]:[rank34]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default3]:[rank35]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank36]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default5]:[rank37]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank38]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank39]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank52]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
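Because every rank fails the same assertion while the model is being built, a run like this can be rejected before it is ever submitted by checking the parallelism layout against the GPU count and the model's head count. A hypothetical pre-flight helper along these lines (names and example values are illustrative assumptions, not part of bench_cluster or nanotron):

def validate_layout(dp: int, tp: int, pp: int, world_size: int, num_attention_heads: int) -> None:
    # Reject layouts that cannot be mapped onto the available GPUs or the model's heads.
    if dp * tp * pp != world_size:
        raise ValueError(f"dp*tp*pp = {dp * tp * pp} does not match world size {world_size}")
    if num_attention_heads % tp != 0:
        raise ValueError(f"num_attention_heads ({num_attention_heads}) is not divisible by tp ({tp})")

validate_layout(dp=2, tp=8, pp=4, world_size=64, num_attention_heads=32)     # passes
# validate_layout(dp=1, tp=64, pp=1, world_size=64, num_attention_heads=32)  # would raise: 32 % 64 != 0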
-E0703 08:41:54.572000 140461360998208 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1198119) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures (all at time 2024-07-03_08:41:54 on host ip-26-0-160-192.ec2.internal, error_file: <N/A>, traceback: To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html):
-  [1]: rank 1 (local_rank: 1), exitcode 1 (pid: 1198120)
-  [2]: rank 2 (local_rank: 2), exitcode 1 (pid: 1198121)
-  [3]: rank 3 (local_rank: 3), exitcode 1 (pid: 1198122)
-  [4]: rank 4 (local_rank: 4), exitcode 1 (pid: 1198123)
-  [5]: rank 5 (local_rank: 5), exitcode 1 (pid: 1198124)
-  [6]: rank 6 (local_rank: 6), exitcode 1 (pid: 1198125)
-  [7]: rank 7 (local_rank: 7), exitcode 1 (pid: 1198126)
-------------------------------------------------------------
-Root Cause (first observed failure):
-  [0]: rank 0 (local_rank: 0), exitcode 1 (pid: 1198119)
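Each entry in the report above has an empty error_file and only points at the PyTorch elastic error docs; torchrun includes the child's actual traceback in this report when the training entrypoint records it. A sketch of the documented pattern, where the record decorator is the real torch.distributed.elastic API, while applying it to a hypothetical main() here is only illustrative and not a description of run_train.py:

from torch.distributed.elastic.multiprocessing.errors import record

@record
def main() -> None:
    # Build the trainer and train; any uncaught exception is written to the
    # error file that torchrun then surfaces in its failure report.
    ...

if __name__ == "__main__":
    main()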
-============================================================
-E0703 08:41:54.666000 140198317426496 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3277067) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 08:41:54.669000 140349233342272 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1120453) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 08:41:54.667000 139687847597888 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 962213) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 08:41:54.667000 140103114262336 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 830415) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 08:41:54.668000 140150072264512 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1892450) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 08:41:54.670000 139714960070464 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2463294) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 08:41:54.671000 140126356367168 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1919503) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[... the torchrun agents on the remaining seven nodes print the same ChildFailedError traceback and "run_train.py FAILED" report as above, interleaved; all entries share time 2024-07-03_08:41:54, exitcode 1, error_file: <N/A>, and traceback: To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ...]
-  host ip-26-0-172-73.ec2.internal:  root cause rank 56 (local_rank: 0, pid: 962213); failures: ranks 57-63 (local_ranks 1-7, pids 962214-962220)
-  host ip-26-0-163-226.ec2.internal: root cause rank 16 (local_rank: 0, pid: 3277067); failures: ranks 17-23 (local_ranks 1-7, pids 3277068-3277074)
-  host ip-26-0-172-57.ec2.internal:  root cause rank 48 (local_rank: 0, pid: 1120453); failures: ranks 49-55 (local_ranks 1-7, pids 1120454-1120460)
-  host ip-26-0-163-220.ec2.internal: root cause rank 8 (local_rank: 0, pid: 830415); failures: ranks 9-15 (local_ranks 1-7, pids 830416-830422)
-  host ip-26-0-168-238.ec2.internal: root cause rank 24 (local_rank: 0, pid: 1919503); failures: ranks 25-31 (local_ranks 1-7, pids 1919504-1919510)
-  host ip-26-0-169-132.ec2.internal: root cause rank 32 (local_rank: 0, pid: 2463294); failures: ranks 33-39 (local_ranks 1-7, pids 2463295-2463301)
-  host ip-26-0-169-86.ec2.internal:  failures: ranks 41-47 (local_ranks 1-7, pids 1892451-1892457)
-Root Cause (first observed failure):
-  [0]: time : 2024-07-03_08:41:54, host :
ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 1892450) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 4: Exited with exit code 1 -srun: error: ip-26-0-169-132: task 5: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 1: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-32/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/bench.slurm deleted file mode 100644 index 85517e1651485c88f310e6d87339adc3a460e352..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations.
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/config.yaml deleted file mode 100644 index 8dc8871d06684b67c091ed1041c306fd7af22698..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 256 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 4 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out deleted file mode 100644 index d4f2c3abcacee7a836bc5391b2468bd78e78a81c..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:05:18 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:05:26.510000 140551663458112 torch/distributed/run.py:757] -W0703 03:05:26.510000 140551663458112 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.510000 140551663458112 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:05:26.510000 140551663458112 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.563000 140057300637504 torch/distributed/run.py:757] -W0703 03:05:26.563000 140057300637504 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.563000 140057300637504 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:05:26.563000 140057300637504 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.864000 140472173479744 torch/distributed/run.py:757] -W0703 03:05:26.864000 140472173479744 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.864000 140472173479744 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:05:26.864000 140472173479744 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.882000 140185577748288 torch/distributed/run.py:757] -W0703 03:05:26.882000 140185577748288 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.882000 140185577748288 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:05:26.882000 140185577748288 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.887000 140357741373248 torch/distributed/run.py:757] -W0703 03:05:26.887000 140357741373248 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.887000 140357741373248 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:05:26.887000 140357741373248 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.891000 140159660582720 torch/distributed/run.py:757] -W0703 03:05:26.891000 140159660582720 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.891000 140159660582720 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:05:26.891000 140159660582720 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.928000 140086511552320 torch/distributed/run.py:757] -W0703 03:05:26.928000 140086511552320 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.928000 140086511552320 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:05:26.928000 140086511552320 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.932000 140132832876352 torch/distributed/run.py:757] -W0703 03:05:26.932000 140132832876352 torch/distributed/run.py:757] ***************************************** -W0703 03:05:26.932000 140132832876352 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:05:26.932000 140132832876352 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:05:52 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config: -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: run='%date_%jobid', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: step=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: consumed_train_samples=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: benchmark_csv_path=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp=64, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp_engine=, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_mode=, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: expert_parallel_size=1), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:05:52 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: eos_token_id=2, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50304), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_revision=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_max_length=None), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoint_interval=100000, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: save_initial_state=False, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: log_level_replica='info', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: train_steps=20, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: micro_batch_size=4, -[default0]:07/03/2024 03:05:52 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: batch_accumulation_per_replica=256, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: val_check_interval=-1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_val_batches=0, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_test_batches=0), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta1=0.9, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta2=0.95, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: name='adamW'), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: zero_stage=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: weight_decay=0.01, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: clip_grad=1.0, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_steps=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_style='linear', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_style='linear', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_steps=19, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: start_training_step=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_splits='train', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: text_column_name='text'), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_loading_workers=0))], -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4')), -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lighteval=None) -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Model Config: -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: 
eos_token_id=2, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50304) -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Building model.. -[default0]:07/03/2024 03:05:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Setting PP block ranks... -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank25]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank25]: model = self._init_model_instance() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank25]: model = self._init_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank25]: model = build_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank25]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank25]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: self.attn = CausalSelfAttention( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ 
-[default1]:[rank25]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: trainer = DistributedTrainer(config_file) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank56]: self.model = self.init_model() # Defines self.model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank56]: model = self._init_model_instance() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank56]: model = self._init_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: model = build_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank56]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank56]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank56]: self.attn = CausalSelfAttention( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank61]: trainer = DistributedTrainer(config_file) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank61]: self.model = self.init_model() # Defines self.model -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank61]: model = self._init_model_instance() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank61]: model = self._init_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank61]: model = build_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank61]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank61]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank61]: self.attn = CausalSelfAttention( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank61]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank61]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank31]: model = self._init_model_instance() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: model = self._init_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: model = build_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank27]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank27]: self.model = self.init_model() # Defines self.model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank27]: model = self._init_model_instance() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank27]: model = self._init_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank60]: Traceback (most recent call last): -[default3]:[rank27]: model = build_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank27]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank27]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank27]: self.attn = CausalSelfAttention( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank27]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank60]: trainer = DistributedTrainer(config_file) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank60]: self.model = self.init_model() # Defines self.model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank60]: model = self._init_model_instance() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank60]: model = self._init_model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank60]: model = build_model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank59]: trainer = DistributedTrainer(config_file) -[default4]:[rank60]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank60]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank60]: self.attn = CausalSelfAttention( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank58]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank60]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank60]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank58]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank58]: model = self._init_model_instance() -[default3]:[rank59]: self.model = self.init_model() # Defines self.model -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank59]: model = self._init_model_instance() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank59]: model = self._init_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank59]: model = build_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank59]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank59]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank59]: self.attn = CausalSelfAttention( -[default2]:[rank58]: model = self._init_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank59]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank59]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank58]: model = build_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank58]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank58]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank58]: self.attn = CausalSelfAttention( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank58]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank58]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank28]: trainer = DistributedTrainer(config_file) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: self.model = self.init_model() # Defines self.model -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank28]: model = self._init_model_instance() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank28]: model = self._init_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank28]: model = build_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank28]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank28]: self.attn = CausalSelfAttention( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank28]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: trainer = DistributedTrainer(config_file) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank30]: self.model = self.init_model() # Defines self.model -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank30]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank30]: model = self._init_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank30]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank30]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank30]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank30]: self.attn = CausalSelfAttention( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank30]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank29]: Traceback (most recent call last):
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default5]:[rank29]:     trainer = DistributedTrainer(config_file)
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default5]:[rank29]:     self.model = self.init_model()  # Defines self.model
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default5]:[rank29]:     model = self._init_model_instance()
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default5]:[rank29]:     model = self._init_model(
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default5]:[rank29]:     model = build_model(
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default5]:[rank29]:     block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default5]:[rank29]:     self.pp_block = self.module_builder(**self.module_kwargs)
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default5]:[rank29]:     self.attn = CausalSelfAttention(
-[default5]:[rank29]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default5]:[rank29]:     config.num_attention_heads % tp_pg.size() == 0
-[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-
-[The same traceback, ending in the same AssertionError, is repeated and interleaved by the other ranks in this log excerpt (ranks 0-5, 7, 9, 10, 15-24, 26, 32, 34, 35, 38-41, 43-48, 52, 54, 55, 57, 62, 63); only the "[defaultN]:[rankM]:" prefix differs.]
by TP size (64). -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank35]: self.model = self.init_model() # Defines self.model -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank35]: model = self._init_model_instance() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank3]: self.model = self.init_model() # Defines self.model -[default4]:[rank4]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank35]: model = self._init_model( -[default2]:[rank2]: model = build_model( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank7]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank35]: model = build_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank35]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank35]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank35]: self.attn = CausalSelfAttention( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank35]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank32]: model = self._init_model( -[default6]:[rank14]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank14]: self.model = self.init_model() # Defines self.model -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank14]: model = self._init_model_instance() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank32]: model = build_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank32]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank32]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank14]: model = self._init_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank8]: Traceback (most recent call last): -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank13]: Traceback (most recent call last): -[default6]:[rank14]: model = build_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank14]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank8]: trainer = DistributedTrainer(config_file) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank32]: self.attn = CausalSelfAttention( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank36]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank8]: self.model = self.init_model() # Defines self.model -[default5]:[rank37]: trainer = DistributedTrainer(config_file) -[default4]:[rank36]: trainer = DistributedTrainer(config_file) -[default0]:[rank32]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank32]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank14]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank13]: trainer = DistributedTrainer(config_file) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank37]: self.model = self.init_model() # Defines self.model -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: model = self._init_model_instance() -[default0]:[rank8]: model = self._init_model_instance() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank13]: self.model = self.init_model() # Defines self.model -[default4]:[rank36]: self.model = self.init_model() # Defines self.model -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank36]: model = self._init_model_instance() -[default6]:[rank14]: self.attn = CausalSelfAttention( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank36]: model = self._init_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank36]: model = build_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank36]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank37]: model = self._init_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank37]: model = build_model( -[default0]:[rank8]: model = self._init_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank13]: model = self._init_model_instance() -[default5]:[rank37]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank14]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in 
_init_model_instance -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank13]: model = self._init_model( -[default5]:[rank37]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank8]: model = build_model( -[default4]:[rank36]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank37]: self.attn = CausalSelfAttention( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank13]: model = build_model( -[default0]:[rank8]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank8]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank13]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank8]: self.attn = CausalSelfAttention( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank8]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank13]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank13]: self.attn = CausalSelfAttention( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank13]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank13]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank49]: trainer = DistributedTrainer(config_file) -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank49]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank49]: model = self._init_model_instance() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank49]: model = self._init_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: model = build_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default1]:[rank49]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank49]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank49]: self.attn = CausalSelfAttention( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank49]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank49]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: model = build_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank53]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank53]: self.model = self.init_model() # Defines self.model -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank51]: model = self._init_model_instance() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = self._init_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank53]: model = self._init_model_instance() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank53]: model = self._init_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank53]: model = build_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank53]: self.pp_block = self.module_builder(**self.module_kwargs) 
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: model = build_model( -[default5]:[rank53]: self.attn = CausalSelfAttention( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank53]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank11]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank36]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank11]: self.model = self.init_model() # Defines self.model -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank11]: model = self._init_model_instance() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank36]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank7]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank3]: model = self._init_model_instance() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank36]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank37]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank37]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: model = self._init_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: trainer = DistributedTrainer(config_file) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: self.model = self.init_model() # Defines self.model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank33]: model = self._init_model_instance() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank33]: model = self._init_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: model = build_model( -[def[default3]:[rank3]: model = self._init_model( -[default4]:[rank12]: trainer = DistributedTrainer(config_file) -ault1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: model = build_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank33]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank33]: self.attn = CausalSelfAttention( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank33]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank33]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank2]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank11]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank2]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank2]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank12]: self.model = self.init_model() # Defines self.model -[default2]:[rank2]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank11]: self.attn = CausalSelfAttention( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank3]: model = build_model( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank3]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank12]: model = self._init_model_instance() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank12]: model = self._init_model( -[default3]:[rank3]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank3]: self.attn = CausalSelfAttention( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank12]: model = build_model( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank3]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank3]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank11]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank12]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank6]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank6]: self.model = self.init_model() # Defines self.model -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank12]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank12]: self.attn = CausalSelfAttention( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank6]: model = self._init_model_instance() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank6]: model = self._init_model( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank12]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank12]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank6]: model = build_model( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank6]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank6]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank6]: self.attn = CausalSelfAttention( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank6]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank6]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank42]: Traceback (most recent call last):
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
-[default2]:[rank42]:     trainer = DistributedTrainer(config_file)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__
-[default2]:[rank42]:     self.model = self.init_model() # Defines self.model
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model
-[default2]:[rank42]:     model = self._init_model_instance()
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance
-[default2]:[rank42]:     model = self._init_model(
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model
-[default2]:[rank42]:     model = build_model(
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model
-[default2]:[rank42]:     block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx])
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank
-[default2]:[rank42]:     self.pp_block = self.module_builder(**self.module_kwargs)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__
-[default2]:[rank42]:     self.attn = CausalSelfAttention(
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__
-[default2]:[rank42]:     config.num_attention_heads % tp_pg.size() == 0
-[default2]:[rank42]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
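The assertion that kills every rank is the tensor-parallel sharding constraint checked in CausalSelfAttention.__init__ (llama.py, line 271): attention heads are split across the TP group, so the head count has to divide evenly by the TP degree. Below is a minimal Python sketch of that constraint, using only the values quoted in the error message (32 heads, TP size 64); the variable names are illustrative, not Nanotron's.

# Why the run fails: tensor parallelism gives each TP rank an equal slice of
# the attention heads, so the head count must be a multiple of the TP degree.
num_attention_heads = 32  # Llama-1B head count quoted in the error message
tp_size = 64              # tensor-parallel degree this job was launched with

assert num_attention_heads % tp_size == 0, (
    f"Number of attention heads ({num_attention_heads}) must be divisible "
    f"by TP size ({tp_size})."
)  # 32 % 64 == 32, so this raises, exactly as in the log above

Any TP degree that divides 32 (1, 2, 4, 8, 16 or 32) would pass the check and give num_attention_heads // tp_size whole heads per tensor-parallel rank; with 64 GPUs the remaining factor has to go to data or pipeline parallelism instead.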
-E0703 03:05:58.255000 140132832876352 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 884735) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures (all at 2024-07-03_03:05:58 on host ip-26-0-161-103.ec2.internal, exitcode 1; traceback: see https://pytorch.org/docs/stable/elastic/errors.html):
-  [1] rank 1 (local_rank: 1, pid: 884736)
-  [2] rank 2 (local_rank: 2, pid: 884737)
-  [3] rank 3 (local_rank: 3, pid: 884738)
-  [4] rank 4 (local_rank: 4, pid: 884739)
-  [5] rank 5 (local_rank: 5, pid: 884740)
-  [6] rank 6 (local_rank: 6, pid: 884741)
-  [7] rank 7 (local_rank: 7, pid: 884742)
-------------------------------------------------------------
-Root Cause (first observed failure):
-  [0] rank 0 (local_rank: 0, pid: 884735) at 2024-07-03_03:05:58 on ip-26-0-161-103.ec2.internal, exitcode 1
-============================================================
-E0703 03:05:58.351000 140159660582720 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 680884) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 03:05:58.352000 140472173479744 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 19203) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 03:05:58.353000 140086511552320 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3780535) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 03:05:58.354000 140185577748288 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 898300) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 03:05:58.354000 140057300637504 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1436450) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 03:05:58.354000 140551663458112 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1160130) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 03:05:58.358000 140357741373248 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3909934) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[the same torchrun traceback (torchrun:8 -> run.py:879 main -> run.py:870 run -> launcher/api.py:132 __call__ -> launcher/api.py:263 launch_agent -> ChildFailedError) and the same "run_train.py FAILED" summary are repeated, interleaved, by the agent on each remaining node, all at 2024-07-03_03:05:58 with exitcode 1:]
-  ip-26-0-161-153.ec2.internal: ranks 17-23 (local_ranks 1-7, pids 1436451-1436457); root cause rank 16 (local_rank: 0, pid: 1436450)
-  ip-26-0-171-102.ec2.internal: ranks 41-47 (local_ranks 1-7, pids 3780536-3780542); root cause rank 40 (local_rank: 0, pid: 3780535)
-  ip-26-0-166-125.ec2.internal: ranks 33-39 (local_ranks 1-7, pids 19204-19210); root cause rank 32 (local_rank: 0, pid: 19203)
-  ip-26-0-161-138.ec2.internal: ranks 9-15 (local_ranks 1-7, pids 680885-680891); root cause rank 8 (local_rank: 0, pid: 680884)
-  ip-26-0-171-88.ec2.internal: ranks 57-63 (local_ranks 1-7, pids 898301-898307); root cause rank 56 (local_rank: 0, pid: 898300)
-  ip-26-0-171-62.ec2.internal: ranks 49-55 (local_ranks 1-7, pids 3909935-3909941); the log is cut off before this node's root-cause entry
see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:05:58 - host : ip-26-0-171-62.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 3909934) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:05:58 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1160131) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:05:58 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 1160132) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:05:58 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1160133) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:05:58 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 1160134) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:05:58 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1160135) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:05:58 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1160136) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:05:58 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1160137) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:05:58 - host : 
ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1160130) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-103: task 1: Exited with exit code 1 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 0: Exited with exit code 1 -srun: error: ip-26-0-166-125: task 4: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 2: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-4/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/bench.slurm deleted file mode 100644 index 6dfc7a1d61a9298bd90d6bec9e64e8ce0e2a38e1..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/config.yaml deleted file mode 100644 index 609883bc42682df629b04d078f8e946e760b9a83..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 2 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 512 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out deleted file mode 100644 index 9683047c074f7d5656005fce1f25c76c289117e6..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 02:10:42 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
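For reference, the config.yaml just above pairs dp=1, tp=64, pp=1 with micro_batch_size=512, so the 8 nodes x 8 GPUs requested in the Slurm header map one-to-one onto the 64 ranks launched by torchrun (--nnodes 8 --nproc_per_node 8), all of them tensor-parallel. A minimal arithmetic sketch of those relationships (illustrative only, not part of the original files; the per-step sample count assumes the usual micro_batch_size * batch_accumulation_per_replica * dp convention):

# Sanity arithmetic for the dp-1_tp-64_pp-1_mbz-512 config above (illustrative sketch).
dp, tp, pp = 1, 64, 1
nnodes, gpus_per_node = 8, 8
assert dp * tp * pp == nnodes * gpus_per_node      # all 64 GPUs go to tensor parallelism

micro_batch_size = 512
batch_accumulation_per_replica = 2
sequence_length = 4096
samples_per_step = micro_batch_size * batch_accumulation_per_replica * dp   # 1024 sequences
tokens_per_step = samples_per_step * sequence_length                        # 4,194,304 tokens
print(samples_per_step, tokens_per_step)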
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 02:10:48.016000 139749286176576 torch/distributed/run.py:757] -W0703 02:10:48.016000 139749286176576 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.016000 139749286176576 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:10:48.016000 139749286176576 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.014000 139913838036800 torch/distributed/run.py:757] -W0703 02:10:48.014000 139913838036800 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.014000 139913838036800 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:10:48.014000 139913838036800 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.016000 139632837773120 torch/distributed/run.py:757] -W0703 02:10:48.016000 139632837773120 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.016000 139632837773120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:10:48.016000 139632837773120 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.188000 140092932007744 torch/distributed/run.py:757] -W0703 02:10:48.188000 140092932007744 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.188000 140092932007744 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:10:48.188000 140092932007744 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.195000 139718415808320 torch/distributed/run.py:757] -W0703 02:10:48.195000 139718415808320 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.195000 139718415808320 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 02:10:48.195000 139718415808320 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.379000 140081764083520 torch/distributed/run.py:757] -W0703 02:10:48.379000 140081764083520 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.379000 140081764083520 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:10:48.379000 140081764083520 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.382000 139648528856896 torch/distributed/run.py:757] -W0703 02:10:48.382000 139648528856896 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.382000 139648528856896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:10:48.382000 139648528856896 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.882000 140477750601536 torch/distributed/run.py:757] -W0703 02:10:48.882000 140477750601536 torch/distributed/run.py:757] ***************************************** -W0703 02:10:48.882000 140477750601536 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 02:10:48.882000 140477750601536 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 02:11:13 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=64, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:11:13 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=512, -[default0]:07/03/2024 02:11:13 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=2, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512')), -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304) -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 02:11:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank11]: trainer = DistributedTrainer(config_file) -[default0]:[rank8]: trainer = DistributedTrainer(config_file) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank8]: self.model = self.init_model() # Defines self.model -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank8]: model = self._init_model_instance() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: self.model = self.init_model() # Defines self.model -[default0]:[rank8]: model = self._init_model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank8]: model = build_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank11]: model = self._init_model_instance() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank8]: 
block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: model = self._init_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank8]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank11]: model = build_model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank8]: self.attn = CausalSelfAttention( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank11]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank8]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank11]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: self.attn = CausalSelfAttention( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
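This assertion is the root cause of the run: the deleted config.yaml sets model_config.num_attention_heads to 32 while parallelism.tp is 64, and 32 % 64 == 32, so the check in src/nanotron/models/llama.py (line 271 in the tracebacks) fails on every rank during model construction. A minimal sketch of the same divisibility check, using illustrative names rather than the actual nanotron API:

# The constraint each rank trips over: every TP rank must own a whole number of attention heads.
def heads_divisible_by_tp(num_attention_heads: int, tp_size: int) -> bool:
    return num_attention_heads % tp_size == 0

print(heads_divisible_by_tp(32, 64))   # False -> the AssertionError repeated in this log
print(heads_divisible_by_tp(32, 32))   # True  -> a TP size this check would accept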
-[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank9]: trainer = DistributedTrainer(config_file) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank9]: self.model = self.init_model() # Defines self.model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank9]: model = self._init_model_instance() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank9]: model = self._init_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: model = build_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank9]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank9]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank9]: self.attn = CausalSelfAttention( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank9]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank9]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank14]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank15]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: self.model = self.init_model() # Defines self.model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: self.model = self.init_model() # Defines self.model -[default6]:[rank14]: model = self._init_model_instance() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank14]: model = self._init_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: model = self._init_model_instance() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: model = self._init_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: model = build_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank15]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank14]: model = build_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank15]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank14]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank15]: self.attn = CausalSelfAttention( -[default6]:[rank14]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank15]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank14]: self.attn = CausalSelfAttention( -[default6]:[rank14]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank15]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank14]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank16]: trainer = DistributedTrainer(config_file) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank16]: self.model = self.init_model() # Defines self.model -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: model = self._init_model_instance() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank16]: model = self._init_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank16]: model = build_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank16]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank16]: self.attn = CausalSelfAttention( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank18]: trainer = DistributedTrainer(config_file) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: model = self._init_model_instance() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank18]: model = self._init_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: model = build_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank18]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank18]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank18]: self.attn = CausalSelfAttention( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank18]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank18]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank21]: trainer = DistributedTrainer(config_file) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank21]: self.model = self.init_model() # Defines self.model -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank21]: model = self._init_model_instance() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank21]: model = self._init_model( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank21]: model = build_model( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank21]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank21]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank21]: self.attn = CausalSelfAttention( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank21]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank21]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank17]: trainer = DistributedTrainer(config_file) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank17]: self.model = self.init_model() # Defines self.model -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank17]: model = self._init_model_instance() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank17]: model = self._init_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank17]: model = build_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank17]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank17]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank17]: self.attn = CausalSelfAttention( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank17]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank17]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank20]: model = self._init_model_instance() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank20]: model = build_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: self.attn = CausalSelfAttention( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank20]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[... identical start-up failures from the remaining ranks follow, heavily interleaved in the log. Ranks 0, 1, 3, 5, 6, 7, 10, 12, 13, 19, 22, 23, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 52, 53, 54, and 55 each hit the same traceback as rank 20 above and raise "AssertionError: Number of attention heads (32) must be divisible by TP size (64)." from CausalSelfAttention.__init__ (nanotron/src/nanotron/models/llama.py, line 271); tracebacks for ranks 49, 50, and 51 also begin in this span but are truncated by the interleaving. ...]
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank49]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank2]: trainer = DistributedTrainer(config_file) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank2]: self.model = self.init_model() # Defines self.model -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank2]: model = self._init_model_instance() -[default3]:[rank51]: model = build_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank2]: model = self._init_model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank2]: model = build_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank2]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank2]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank2]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank2]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank4]: Traceback (most recent call last): -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank4]: trainer = DistributedTrainer(config_file) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank49]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank4]: self.model = self.init_model() # Defines self.model -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank4]: model = self._init_model_instance() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank4]: model = self._init_model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank4]: model = build_model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank4]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank4]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank4]: self.attn = CausalSelfAttention( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank49]: self.attn = CausalSelfAttention( -[default4]:[rank4]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank4]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank49]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank49]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank34]: trainer = DistributedTrainer(config_file) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank34]: self.model = self.init_model() # Defines self.model -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank34]: model = self._init_model_instance() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank34]: model = self._init_model( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank34]: model = build_model( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank34]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank34]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank34]: self.attn = CausalSelfAttention( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank34]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank34]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank58]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank58]: model = self._init_model_instance() -[default0]:[rank56]: Traceback (most recent call last): -[default1]:[rank57]: trainer = DistributedTrainer(config_file) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank58]: model = self._init_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: trainer = DistributedTrainer(config_file) -[default2]:[rank58]: model = build_model( -[default1]:[rank57]: self.model = self.init_model() # Defines self.model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank56]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank57]: model = self._init_model_instance() -[default2]:[rank58]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank56]: model = self._init_model_instance() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank56]: model = self._init_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: model = build_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank56]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank58]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank56]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank60]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank58]: self.attn = CausalSelfAttention( -[default7]:[rank63]: trainer = DistributedTrainer(config_file) -[default0]:[rank56]: self.attn = CausalSelfAttention( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank63]: self.model = self.init_model() # Defines self.model -[default1]:[rank57]: model = self._init_model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank63]: model = self._init_model_instance() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank58]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank57]: model = build_model( -[default5]:[rank61]: trainer = DistributedTrainer(config_file) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank63]: model = self._init_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank60]: trainer = DistributedTrainer(config_file) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank59]: trainer = DistributedTrainer(config_file) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank57]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank60]: self.model = self.init_model() # Defines self.model -[default3]:[rank59]: self.model = self.init_model() # Defines self.model -[default2]:[rank58]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank57]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank62]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank60]: model = self._init_model_instance() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank63]: model = build_model( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank59]: model = self._init_model_instance() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank57]: self.attn = CausalSelfAttention( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank62]: trainer = DistributedTrainer(config_file) -[default5]:[rank61]: self.model = self.init_model() # Defines self.model -[default3]:[rank59]: model = self._init_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank60]: model = self._init_model( -[default3]:[rank59]: model = build_model( -[default1]:[rank57]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank57]: AssertionError: Number of 
attention heads (32) must be divisible by TP size (64). -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank63]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank60]: model = build_model( -[default6]:[rank62]: self.model = self.init_model() # Defines self.model -[default7]:[rank63]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank61]: model = self._init_model_instance() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank62]: model = self._init_model_instance() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank60]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank62]: model = self._init_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank60]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank61]: model = self._init_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank62]: model = build_model( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank63]: self.attn = CausalSelfAttention( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank63]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank59]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank62]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank62]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank60]: self.attn = CausalSelfAttention( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank62]: self.attn = CausalSelfAttention( -[default5]:[rank61]: model = build_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank63]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank61]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank60]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank59]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank62]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank60]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank62]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank61]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank59]: self.attn = CausalSelfAttention( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank59]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank61]: self.attn = CausalSelfAttention( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank61]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank61]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank59]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank25]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: trainer = DistributedTrainer(config_file) -[default1]:[rank25]: self.model = self.init_model() # Defines self.model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank31]: model = self._init_model_instance() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: self.model = self.init_model() # Defines self.model -[default1]:[rank25]: model = self._init_model_instance() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank25]: model = self._init_model( -[default7]:[rank31]: model = self._init_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: model = self._init_model_instance() -[default1]:[rank25]: model = build_model( -[default7]:[rank31]: model = build_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank26]: model = self._init_model( -[default1]:[rank25]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in 
build_and_set_rank -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank25]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank26]: model = build_model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default2]:[rank26]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank26]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank25]: self.attn = CausalSelfAttention( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank26]: self.attn = CausalSelfAttention( -[default1]:[rank25]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank26]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank26]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: Traceback (most recent call last): -[default4]:[rank28]: trainer = DistributedTrainer(config_file) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank28]: self.model = self.init_model() # Defines self.model -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank30]: trainer = DistributedTrainer(config_file) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank30]: self.model = self.init_model() # Defines self.model -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank30]: model = self._init_model_instance() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank28]: model = self._init_model_instance() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank30]: model = self._init_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank30]: model = build_model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank28]: model = self._init_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank30]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: model = build_model( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank28]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank30]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank28]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank28]: self.attn = CausalSelfAttention( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank28]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank28]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank30]: self.attn = CausalSelfAttention( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank30]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank30]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank24]: trainer = DistributedTrainer(config_file) -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank27]: trainer = DistributedTrainer(config_file) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank24]: self.model = self.init_model() # Defines self.model -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank24]: model = self._init_model_instance() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank27]: self.model = self.init_model() # Defines self.model -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank27]: model = self._init_model_instance() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank24]: model = self._init_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank24]: model = build_model( -[default3]:[rank27]: model = self._init_model( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank24]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank24]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank27]: model = build_model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank27]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank24]: self.attn = CausalSelfAttention( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank27]: 
self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank24]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank24]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank27]: self.attn = CausalSelfAttention( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank27]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model_instance() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: model = self._init_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank29]: model = build_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
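The assertion that kills the run is nanotron's tensor-parallel head check in CausalSelfAttention (llama.py line 271 in the traceback above): this llama-1B config has 32 attention heads, and 32 is not divisible by the tensor-parallel group size of 64, so every worker aborts before training starts. The sketch below is illustrative only (the helper name and the demo values are assumptions, not nanotron code); it reproduces the constraint and lists the TP sizes that would pass it for a 32-head model.

# Minimal pre-flight check mirroring the failing assert
# (config.num_attention_heads % tp_pg.size() == 0); names are hypothetical.
def check_tp_divisibility(num_attention_heads: int, tp_size: int) -> None:
    """Fail fast if attention heads cannot be sharded evenly across TP ranks."""
    if num_attention_heads % tp_size != 0:
        raise ValueError(
            f"Number of attention heads ({num_attention_heads}) must be "
            f"divisible by TP size ({tp_size})."
        )

if __name__ == "__main__":
    heads = 32  # value taken from the assertion message above
    print([tp for tp in range(1, 65) if heads % tp == 0])  # [1, 2, 4, 8, 16, 32]
    check_tp_divisibility(heads, 64)  # raises ValueError, reproducing the failure

Any of the listed divisors would satisfy the check, with the usual constraint that dp * tp * pp still has to match the 64 GPUs the job requests. After the assertion fires on the workers, the torchrun agent on each node reports the failure and tears the job down, starting with the agent on ip-26-0-172-73: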
-E0703 02:11:20.244000 139913838036800 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 890269) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 57 (local_rank: 1)
-  exitcode   : 1 (pid: 890270)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 58 (local_rank: 2)
-  exitcode   : 1 (pid: 890271)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[3]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 59 (local_rank: 3)
-  exitcode   : 1 (pid: 890272)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[4]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 60 (local_rank: 4)
-  exitcode   : 1 (pid: 890273)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[5]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 61 (local_rank: 5)
-  exitcode   : 1 (pid: 890274)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[6]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 62 (local_rank: 6)
-  exitcode   : 1 (pid: 890275)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[7]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 63 (local_rank: 7)
-  exitcode   : 1 (pid: 890276)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-172-73.ec2.internal
-  rank       : 56 (local_rank: 0)
-  exitcode   : 1 (pid: 890269)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
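Every failure entry in the report above has an empty error_file and only the generic "To enable traceback see" pointer: torchrun can copy a worker's actual traceback into this report only when the child entrypoint records it, typically by decorating the entrypoint with torch.distributed.elastic.multiprocessing.errors.record. Whether run_train.py in this repository already does so is not visible in the log, so the sketch below is only an illustration of the documented pattern, with a stand-in body rather than the real nanotron entrypoint.

# Illustrative use of the elastic error-recording decorator; the raised error
# below is a stand-in that mimics the failure seen in this log.
from torch.distributed.elastic.multiprocessing.errors import record

@record  # persists the worker's exception so the torchrun agent can include it in its report
def main() -> None:
    raise AssertionError("Number of attention heads (32) must be divisible by TP size (64).")

if __name__ == "__main__":
    main()

The seven remaining node agents log the same failure: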
-E0703 02:11:20.339000 140081764083520 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 511422) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:11:20.340000 139749286176576 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3207448) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:11:20.338000 140092932007744 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 761350) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:11:20.340000 140477750601536 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1848168) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:11:20.340000 139632837773120 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1047668) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:11:20.342000 139718415808320 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1124397) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 02:11:20.341000 139648528856896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1821919) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
Each of these agents prints the same torchrun traceback ending in ChildFailedError, followed by an analogous run_train.py FAILED report, interleaved in the original log. In the reports captured here, every listed rank exited with code 1 at 2024-07-03_02:11:20 and the root cause is attributed to local_rank 0 of the node: ranks 9-15 failed with root cause rank 8 on ip-26-0-161-178.ec2.internal (pids 511422-511429), ranks 25-31 with root cause rank 24 on ip-26-0-163-226.ec2.internal (pids 3207448-3207455), and ranks 1-7 with root cause rank 0 on ip-26-0-160-192.ec2.internal (pids 1124397-1124404). The report for ip-26-0-163-220.ec2.internal begins:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time       : 2024-07-03_02:11:20
-  host       : ip-26-0-163-220.ec2.internal
-  rank       : 17 (local_rank: 1)
-  exitcode   : 1 (pid: 761351)
-  error_file: <N/A>
-  traceback :
To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:11:20 - host : ip-26-0-163-220.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 761353) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:11:20 - host : ip-26-0-163-220.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 761354) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:11:20 - host : ip-26-0-163-220.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 761355) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_02:11:20 - host : ip-26-0-163-220.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 761356) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_02:11:20 - host : ip-26-0-163-220.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 761357) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:11:20 - host : ip-26-0-163-220.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 761350) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 1047669) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 1047670) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 1047671) - 
error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 1047672) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 1047673) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 1047674) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 1047675) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:11:20 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 1047668) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 33 (local_rank: 1) - exitcode : 1 (pid: 1848169) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 1848170) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 35 (local_rank: 3) - exitcode : 1 (pid: 1848171) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 1848172) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 37 (local_rank: 5) - exitcode : 1 (pid: 1848173) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 (pid: 1848174) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 39 (local_rank: 7) - exitcode : 1 (pid: 1848175) - error_file: - traceback : To enable 
traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:11:20 - host : ip-26-0-168-238.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 1848168) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_02:11:20 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 1821920) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_02:11:20 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : 1 (pid: 1821921) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_02:11:20 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 1821922) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_02:11:20 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : 1 (pid: 1821923) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_02:11:20 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : 1 (pid: 1821924) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_02:11:20 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : 1 (pid: 1821925) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_02:11:20 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : 1 (pid: 1821926) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_02:11:20 - 
host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 1821919) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-512/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/bench.slurm deleted file mode 100644 index 93c1afe09745c3fbf7226d99c100151214e51bb1..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/config.yaml deleted file mode 100644 index 617e4a3fd9c3f2f01e3bd3af65c948e7be7ac3c6..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 16 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 64 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out deleted file mode 100644 index 9ee99e4c809005f2e7454ee4736c3d7616e65fc7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:02:26 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:02:31.902000 140496443356992 torch/distributed/run.py:757] -W0703 09:02:31.902000 140496443356992 torch/distributed/run.py:757] ***************************************** -W0703 09:02:31.902000 140496443356992 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:02:31.902000 140496443356992 torch/distributed/run.py:757] ***************************************** -W0703 09:02:32.072000 139836214613824 torch/distributed/run.py:757] -W0703 09:02:32.072000 139836214613824 torch/distributed/run.py:757] ***************************************** -W0703 09:02:32.072000 139836214613824 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:02:32.072000 139836214613824 torch/distributed/run.py:757] ***************************************** -W0703 09:02:32.314000 140451541763904 torch/distributed/run.py:757] -W0703 09:02:32.314000 140451541763904 torch/distributed/run.py:757] ***************************************** -W0703 09:02:32.314000 140451541763904 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:02:32.314000 140451541763904 torch/distributed/run.py:757] ***************************************** -W0703 09:02:32.327000 140712798037824 torch/distributed/run.py:757] -W0703 09:02:32.327000 140712798037824 torch/distributed/run.py:757] ***************************************** -W0703 09:02:32.327000 140712798037824 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:02:32.327000 140712798037824 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.106000 140086218258240 torch/distributed/run.py:757] -W0703 09:02:33.106000 140086218258240 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.106000 140086218258240 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:02:33.106000 140086218258240 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.110000 140006505363264 torch/distributed/run.py:757] -W0703 09:02:33.110000 140006505363264 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.110000 140006505363264 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:02:33.110000 140006505363264 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.228000 139715424569152 torch/distributed/run.py:757] -W0703 09:02:33.228000 139715424569152 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.228000 139715424569152 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:02:33.228000 139715424569152 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.476000 139959100401472 torch/distributed/run.py:757] -W0703 09:02:33.476000 139959100401472 torch/distributed/run.py:757] ***************************************** -W0703 09:02:33.476000 139959100401472 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:02:33.476000 139959100401472 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:02:59 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=64, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:02:59 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=64, -[default0]:07/03/2024 09:02:59 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=16, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64')), -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50304) -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 09:02:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank13]: trainer = DistributedTrainer(config_file) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank13]: self.model = self.init_model() # Defines self.model -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank13]: model = self._init_model_instance() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank13]: model = self._init_model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank13]: model = build_model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank13]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank14]: self.model = 
self.init_model() # Defines self.model -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank14]: model = self._init_model_instance() -[default5]:[rank13]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank14]: model = self._init_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank14]: model = build_model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank13]: self.attn = CausalSelfAttention( -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank13]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank10]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: self.attn = CausalSelfAttention( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank10]: self.model = self.init_model() # Defines self.model -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank10]: model = self._init_model_instance() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank10]: model = self._init_model( -[default6]:[rank14]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank10]: model = build_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank10]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank10]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank11]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank10]: self.attn = CausalSelfAttention( -[default1]:[rank9]: trainer = DistributedTrainer(config_file) -[default3]:[rank11]: trainer = DistributedTrainer(config_file) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank10]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank11]: self.model = self.init_model() # Defines self.model -[default2]:[rank10]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank9]: self.model = self.init_model() # Defines self.model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank9]: model = self._init_model_instance() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank11]: model = self._init_model_instance() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank9]: model = self._init_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: model = build_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank9]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank11]: model = self._init_model( -[default1]:[rank9]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank9]: self.attn = CausalSelfAttention( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: model = build_model( -[default1]:[rank9]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank11]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank9]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: self.attn = CausalSelfAttention( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank8]: trainer = DistributedTrainer(config_file) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank8]: self.model = self.init_model() # Defines self.model -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank8]: model = self._init_model_instance() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank8]: model = self._init_model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank8]: model = build_model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank8]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank8]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank8]: self.attn = CausalSelfAttention( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank8]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank12]: trainer = DistributedTrainer(config_file) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank12]: self.model = self.init_model() # Defines self.model -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank12]: model = self._init_model_instance() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank12]: model = self._init_model( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank12]: model = build_model( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank12]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank12]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank12]: self.attn = CausalSelfAttention( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank12]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank12]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank15]: trainer = DistributedTrainer(config_file) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank15]: self.model = self.init_model() # Defines self.model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: model = self._init_model_instance() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank15]: model = self._init_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: model = build_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank15]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank15]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank15]: self.attn = CausalSelfAttention( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank15]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank15]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[the same traceback repeats, interleaved, for every other failing rank in this span of the log; each one ends with the same error: AssertionError: Number of attention heads (32) must be divisible by TP size (64).]
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -ault5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank21]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank21]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank21]: self.attn = CausalSelfAttention( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank21]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank21]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: model = build_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank55]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank54]: Traceback (most recent call last): -[default7]:[rank55]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank49]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank49]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank49]: self.attn = CausalSelfAttention( -[default6]:[rank54]: trainer = DistributedTrainer(config_file) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank54]: self.model = self.init_model() # Defines self.model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank49]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank54]: model = self._init_model_instance() -[default1]:[rank49]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank54]: model = self._init_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank54]: model = build_model( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank54]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank54]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank54]: self.attn = CausalSelfAttention( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank54]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank54]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank45]: model = self._init_model_instance() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank16]: Traceback (most recent call last): -[default7]:[rank23]: trainer = DistributedTrainer(config_file) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank23]: self.model = self.init_model() # Defines self.model -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank16]: trainer = DistributedTrainer(config_file) -[default7]:[rank23]: model = self._init_model_instance() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank16]: self.model = self.init_model() # Defines self.model -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: model = self._init_model_instance() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank16]: model = self._init_model( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank16]: model = build_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank16]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank23]: model = self._init_model( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank23]: model = build_model( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank23]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank23]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank23]: self.attn = CausalSelfAttention( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank16]: self.attn = CausalSelfAttention( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank23]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default7]:[rank23]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default4]:[rank52]: trainer = DistributedTrainer(config_file) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank52]: self.model = self.init_model() # Defines self.model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank52]: model = self._init_model_instance() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank52]: model = self._init_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank52]: model = build_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank52]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank52]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank52]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: model = self._init_model_instance() -[default4]:[rank52]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = self._init_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: model = build_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank53]: trainer = DistributedTrainer(config_file) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank53]: self.model = self.init_model() # Defines self.model -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank53]: model = self._init_model_instance() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank53]: model = self._init_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank53]: model = build_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank53]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank53]: self.attn = CausalSelfAttention( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank53]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank53]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank17]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank17]: self.model = self.init_model() # Defines self.model -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank17]: model = self._init_model_instance() -[default6]:[rank22]: Traceback (most recent call last): -[default4]:[rank20]: model = self._init_model_instance() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank17]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank17]: model = build_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank20]: model = build_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank22]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank17]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank22]: self.model = self.init_model() # Defines self.model -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: trainer = DistributedTrainer(config_file) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: self.model = self.init_model() # Defines self.model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank33]: model = self._init_model_instance() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank33]: model = self._init_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: model = build_model( -[def[default2]:[rank18]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -ault1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank33]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank33]: self.attn = CausalSelfAttention( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank33]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank33]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank22]: model = self._init_model_instance() -[default4]:[rank20]: self.attn = CausalSelfAttention( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank17]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank22]: model = self._init_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank17]: self.attn = CausalSelfAttention( -[default6]:[rank22]: model = build_model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: model = self._init_model_instance() -[default6]:[rank22]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank17]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank22]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: model = self._init_model( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank17]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank22]: self.attn = CausalSelfAttention( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank22]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank35]: trainer = DistributedTrainer(config_file) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank35]: self.model = self.init_model() # Defines self.model -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank35]: model = self._init_model_instance() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank35]: model = self._init_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank35]: model = build_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank35]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank35]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank35]: self.attn = CausalSelfAttention( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in 
__init__ -[default3]:[rank35]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank34]: trainer = DistributedTrainer(config_file) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank34]: self.model = self.init_model() # Defines self.model -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank34]: model = self._init_model_instance() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank34]: model = self._init_model( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank34]: model = build_model( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank34]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank34]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank34]: self.attn = CausalSelfAttention( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank34]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank34]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank36]: trainer = DistributedTrainer(config_file) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank36]: self.model = self.init_model() # Defines self.model -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank36]: model = self._init_model_instance() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank36]: model = self._init_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank36]: model = build_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank36]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank36]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank36]: self.attn = CausalSelfAttention( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank36]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank36]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank39]: trainer = DistributedTrainer(config_file) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank39]: self.model = self.init_model() # Defines self.model -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank39]: model = self._init_model_instance() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank39]: model = self._init_model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank39]: model = build_model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank39]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank39]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank39]: self.attn = CausalSelfAttention( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank39]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank39]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank37]: trainer = DistributedTrainer(config_file) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank37]: self.model = self.init_model() # Defines self.model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: model = self._init_model_instance() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank37]: model = self._init_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank37]: model = build_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank37]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank37]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank37]: self.attn = CausalSelfAttention( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank37]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank37]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank32]: trainer = DistributedTrainer(config_file) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank32]: self.model = self.init_model() # Defines self.model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank32]: model = self._init_model_instance() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank32]: model = self._init_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank32]: model = build_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank32]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank32]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank32]: self.attn = CausalSelfAttention( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank32]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank32]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank38]: trainer = DistributedTrainer(config_file) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank38]: self.model = self.init_model() # Defines self.model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank38]: model = self._init_model_instance() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank38]: model = self._init_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank38]: model = build_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank38]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank38]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank38]: self.attn = CausalSelfAttention( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank38]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank38]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: model = build_model( -[default6]:[rank22]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank18]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank18]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank18]: self.attn = CausalSelfAttention( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank18]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank18]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank19]: trainer = DistributedTrainer(config_file) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank19]: self.model = self.init_model() # Defines self.model -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank19]: model = self._init_model_instance() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank19]: model = self._init_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank19]: model = build_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank19]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank19]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank19]: self.attn = CausalSelfAttention( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank19]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank19]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: model = build_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
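Every rank trips on the same check: nanotron's CausalSelfAttention (src/nanotron/models/llama.py, line 271 in the traceback) asserts that the number of attention heads is divisible by the tensor-parallel group size, and this run asks for TP=64 with only 32 heads. A minimal sketch of that constraint, written as a hypothetical standalone helper rather than nanotron's actual code:

# Hypothetical helper illustrating the failing assertion. With 32 heads the only
# valid TP sizes are the divisors of 32 (1, 2, 4, 8, 16, 32), so tp_size=64 fails.
def check_tp_compatibility(num_attention_heads: int, tp_size: int) -> None:
    assert num_attention_heads % tp_size == 0, (
        f"Number of attention heads ({num_attention_heads}) must be divisible "
        f"by TP size ({tp_size})."
    )

check_tp_compatibility(32, 32)  # passes
check_tp_compatibility(32, 64)  # raises AssertionError, matching the log above

Since the job spans 64 GPUs, a working layout for this model would have to keep TP at 32 or below and put the remaining factor of the world size into dp and/or pp. After the per-rank assertions, each torchrun node agent reports the aggregate failure: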
-E0703 09:03:05.562000 139715424569152 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 955975) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:03:05.561000 140006505363264 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 738333) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:03:05.562000 140496443356992 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 836435) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:03:05.562000 140451541763904 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 199873) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:03:05.563000 139836214613824 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 853779) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:03:05.565000 139959100401472 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 941250) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:03:05.567000 140086218258240 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 70279) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:03:05.566000 140712798037824 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 466361) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10

Each node agent then prints the same torchrun traceback (their output is interleaved in the log):

-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

and a per-node failure report headed "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED". The four reports that appear in full here, all timestamped 2024-07-03_09:03:05 with exitcode 1, an empty error_file and the pointer "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html", reduce to:

-  ip-26-0-160-225.ec2.internal : ranks 0-7   (pids 70279-70286),   root cause rank 0  (local_rank 0, pid 70279)
-  ip-26-0-166-125.ec2.internal : ranks 48-55 (pids 199873-199880), root cause rank 48 (local_rank 0, pid 199873)
-  ip-26-0-163-147.ec2.internal : ranks 24-31 (pids 853779-853786), root cause rank 24 (local_rank 0, pid 853779)
-  ip-26-0-165-24.ec2.internal  : ranks 40-47 (pids 955975-955982), root cause rank 40 (local_rank 0, pid 955975)

The report from ip-26-0-161-103.ec2.internal is the last one in this part of the log and starts in the original format:

-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time : 2024-07-03_09:03:05
-  host : ip-26-0-161-103.ec2.internal
-  rank : 9 (local_rank: 1)
-  exitcode : 1 (pid: 941251)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time : 2024-07-03_09:03:05
-  host : ip-26-0-161-103.ec2.internal
-  rank : 10 (local_rank: 2)
-  exitcode : 1 (pid: 941252)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[3]:
-  time : 2024-07-03_09:03:05
-  host :
ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 941253) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 941254) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 941255) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 941256) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 941257) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/ela elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -stic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 941250) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 33 (local_rank: 1) - exitcode : 1 (pid: 466362) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 466363) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 35 (local_rank: 3) - exitcode : 1 (pid: 466364) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 466365) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 37 (local_rank: 5) - exitcode : 1 (pid: 466366) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 (pid: 466367) - error_file: - traceback : To 
enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 39 (local_rank: 7) - exitcode : 1 (pid: 466368) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:03:05 - host : ip-26-0-164-207.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 466361) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 836436) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 836437) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 836438) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 836439) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 836440) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 836441) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 836442) - 
error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:03:05 - host : ip-26-0-167-177.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 836435) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 738334) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 738335) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 738336) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 738337) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 738338) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 738339) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 738340) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): 
-[0]: - time : 2024-07-03_09:03:05 - host : ip-26-0-161-138.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 738333) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-166-125: task 6: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 5: Exited with exit code 1 -srun: error: ip-26-0-163-147: task 3: Exited with exit code 1 -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 1: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 2: Exited with exit code 1 -srun: error: ip-26-0-167-177: task 7: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-64/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/bench.slurm b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/bench.slurm deleted file mode 100644 index c54ef57f94536fb3ef65f60b7b5872e6bfd3f9c6..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8 llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/config.yaml b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/config.yaml deleted file mode 100644 index 83f4bc418e093f85c45c16111f7f462cacc528d3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 1 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 64 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 128 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 8 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out deleted file mode 100644 index 7ad1f39ae6c9262a483437cbd86a45b1fbe5fba8..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/log.out +++ /dev/null @@ -1,2068 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:42:12 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:42:18.640000 140425593186112 torch/distributed/run.py:757] -W0703 09:42:18.640000 140425593186112 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.640000 140425593186112 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:18.640000 140425593186112 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.641000 140680035198784 torch/distributed/run.py:757] -W0703 09:42:18.641000 140680035198784 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.641000 140680035198784 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:18.641000 140680035198784 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.638000 139863566198592 torch/distributed/run.py:757] -W0703 09:42:18.638000 139863566198592 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.638000 139863566198592 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:18.638000 139863566198592 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.639000 140239149406016 torch/distributed/run.py:757] -W0703 09:42:18.639000 140239149406016 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.639000 140239149406016 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:18.639000 140239149406016 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.965000 139684394833728 torch/distributed/run.py:757] -W0703 09:42:18.965000 139684394833728 torch/distributed/run.py:757] ***************************************** -W0703 09:42:18.965000 139684394833728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:42:18.965000 139684394833728 torch/distributed/run.py:757] ***************************************** -W0703 09:42:20.088000 140270174660416 torch/distributed/run.py:757] -W0703 09:42:20.088000 140270174660416 torch/distributed/run.py:757] ***************************************** -W0703 09:42:20.088000 140270174660416 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:20.088000 140270174660416 torch/distributed/run.py:757] ***************************************** -W0703 09:42:20.134000 140281013716800 torch/distributed/run.py:757] -W0703 09:42:20.134000 140281013716800 torch/distributed/run.py:757] ***************************************** -W0703 09:42:20.134000 140281013716800 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:20.134000 140281013716800 torch/distributed/run.py:757] ***************************************** -W0703 09:42:20.391000 139850624862016 torch/distributed/run.py:757] -W0703 09:42:20.391000 139850624862016 torch/distributed/run.py:757] ***************************************** -W0703 09:42:20.391000 139850624862016 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:42:20.391000 139850624862016 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:42:45 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=64, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:42:45 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=8, -[default0]:07/03/2024 09:42:45 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=128, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8')), -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50304) -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 09:42:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank48]: trainer = DistributedTrainer(config_file) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank48]: self.model = self.init_model() # Defines self.model -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank48]: model = self._init_model_instance() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank48]: model = self._init_model( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank48]: model = build_model( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank48]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank48]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank48]: self.attn = CausalSelfAttention( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ 
-[default0]:[rank48]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank48]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank5]: trainer = DistributedTrainer(config_file) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank5]: self.model = self.init_model() # Defines self.model -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank5]: model = self._init_model_instance() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank5]: model = self._init_model( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank5]: model = build_model( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank5]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank5]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank5]: self.attn = CausalSelfAttention( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank5]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank5]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank2]: trainer = DistributedTrainer(config_file) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank2]: self.model = self.init_model() # Defines self.model -[default4]:[rank4]: trainer = DistributedTrainer(config_file) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank4]: self.model = self.init_model() # Defines self.model -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank4]: model = self._init_model_instance() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank4]: model = self._init_model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank4]: model = build_model( -[default2]:[rank2]: model = self._init_model_instance() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank2]: model = self._init_model( -[default4]:[rank4]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank4]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank2]: model = build_model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank2]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank4]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank4]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank4]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank2]: self.attn = CausalSelfAttention( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank2]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank2]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank53]: trainer = DistributedTrainer(config_file) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank53]: self.model = self.init_model() # Defines self.model -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank53]: model = self._init_model_instance() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank53]: model = self._init_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank53]: model = build_model( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank53]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank53]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank53]: self.attn = CausalSelfAttention( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank53]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank53]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank49]: trainer = DistributedTrainer(config_file) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank49]: self.model = self.init_model() # Defines self.model -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank49]: model = self._init_model_instance() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank49]: model = self._init_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank49]: model = build_model( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank49]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank49]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank49]: self.attn = CausalSelfAttention( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank49]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank49]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank3]: trainer = DistributedTrainer(config_file) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank3]: self.model = self.init_model() # Defines self.model -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank3]: model = self._init_model_instance() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank3]: model = self._init_model( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank3]: model = build_model( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank3]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank3]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank3]: self.attn = CausalSelfAttention( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank3]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank3]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank50]: trainer = DistributedTrainer(config_file) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank50]: self.model = self.init_model() # Defines self.model -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank50]: model = self._init_model_instance() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank50]: model = self._init_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank50]: model = build_model( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank50]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank50]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank50]: self.attn = CausalSelfAttention( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank50]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank50]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank52]: trainer = DistributedTrainer(config_file) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank52]: self.model = self.init_model() # Defines self.model -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank52]: model = self._init_model_instance() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank52]: model = self._init_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank52]: model = build_model( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank52]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank52]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank52]: self.attn = CausalSelfAttention( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank52]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank52]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank59]: trainer = DistributedTrainer(config_file) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank59]: self.model = self.init_model() # Defines self.model -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank59]: model = self._init_model_instance() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank59]: model = self._init_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank59]: model = build_model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank59]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank59]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank59]: self.attn = CausalSelfAttention( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank59]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank59]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank1]: trainer = DistributedTrainer(config_file) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank1]: self.model = self.init_model() # Defines self.model -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank1]: model = self._init_model_instance() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank1]: model = self._init_model( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank1]: model = build_model( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank1]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank1]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank1]: self.attn = CausalSelfAttention( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank1]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank1]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank0]: trainer = DistributedTrainer(config_file) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank0]: self.model = self.init_model() # Defines self.model -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank0]: model = self._init_model_instance() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank0]: model = self._init_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank0]: model = build_model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank0]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank0]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank0]: self.attn = CausalSelfAttention( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank0]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank0]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank55]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank62]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank51]: trainer = DistributedTrainer(config_file) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank51]: self.model = self.init_model() # Defines self.model -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank51]: model = self._init_model_instance() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank51]: model = self._init_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank51]: model = build_model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank51]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank51]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank51]: self.attn = CausalSelfAttention( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank51]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank51]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank56]: trainer = DistributedTrainer(config_file) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank56]: self.model = self.init_model() # Defines self.model -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank56]: model = self._init_model_instance() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank56]: model = self._init_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank56]: model = build_model( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank56]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank56]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank56]: self.attn = CausalSelfAttention( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank56]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank56]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank54]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default2]:[rank58]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank61]: trainer = DistributedTrainer(config_file) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank61]: self.model = self.init_model() # Defines self.model -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank61]: model = self._init_model_instance() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank61]: model = self._init_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank61]: model = build_model( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank61]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank61]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank61]: self.attn = CausalSelfAttention( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank61]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank61]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank7]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default0]:[rank24]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default2]:[rank34]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank63]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank6]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank60]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default1]:[rank57]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank28]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank30]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank26]: trainer = DistributedTrainer(config_file) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank26]: self.model = self.init_model() # Defines self.model -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank26]: model = self._init_model_instance() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank26]: model = self._init_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank26]: model = build_model( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank26]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank26]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank26]: self.attn = CausalSelfAttention( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank26]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank26]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default3]:[rank27]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default1]:[rank25]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank31]: trainer = DistributedTrainer(config_file) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank31]: self.model = self.init_model() # Defines self.model -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank31]: model = self._init_model_instance() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank31]: model = self._init_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank31]: model = build_model( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank31]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank31]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank31]: self.attn = CausalSelfAttention( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank31]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank31]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank29]: trainer = DistributedTrainer(config_file) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank29]: self.model = self.init_model() # Defines self.model -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank29]: model = self._init_model_instance() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank29]: model = self._init_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank29]: model = build_model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank29]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank29]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank29]: self.attn = CausalSelfAttention( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank29]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank29]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank20]: trainer = DistributedTrainer(config_file) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank20]: self.model = self.init_model() # Defines self.model -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank20]: model = self._init_model_instance() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank20]: model = self._init_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank20]: model = build_model( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank20]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank20]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank20]: self.attn = CausalSelfAttention( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank20]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank20]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank18]: trainer = DistributedTrainer(config_file) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank18]: self.model = self.init_model() # Defines self.model -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank18]: model = self._init_model_instance() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank18]: model = self._init_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank18]: model = build_model( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank18]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank18]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank18]: self.attn = CausalSelfAttention( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank18]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank18]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default5]:[rank21]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default0]:[rank8]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default6]:[rank22]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default4]:[rank44]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default2]:[rank42]: AssertionError: Number of attention heads (32) must be divisible by TP size (64).
-[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank17]: trainer = DistributedTrainer(config_file) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank17]: self.model = self.init_model() # Defines self.model -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank17]: model = self._init_model_instance() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank17]: model = self._init_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank17]: model = build_model( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank17]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank17]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank17]: self.attn = CausalSelfAttention( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank17]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank17]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank19]: trainer = DistributedTrainer(config_file) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank19]: self.model = self.init_model() # Defines self.model -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank19]: model = self._init_model_instance() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank19]: model = self._init_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank19]: model = build_model( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank19]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank19]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank19]: self.attn = CausalSelfAttention( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank19]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank19]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank13]: trainer = DistributedTrainer(config_file) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank13]: self.model = self.init_model() # Defines self.model -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank13]: model = self._init_model_instance() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank13]: model = self._init_model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank13]: model = build_model( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank13]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank13]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank13]: self.attn = CausalSelfAttention( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank13]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank13]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank23]: trainer = DistributedTrainer(config_file) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank23]: self.model = self.init_model() # Defines self.model -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank23]: model = self._init_model_instance() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank23]: model = self._init_model( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank23]: model = build_model( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank23]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank23]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank23]: self.attn = CausalSelfAttention( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank23]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank23]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank45]: trainer = DistributedTrainer(config_file) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank45]: self.model = self.init_model() # Defines self.model -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank45]: model = self._init_model_instance() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank45]: model = self._init_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank45]: model = build_model( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank45]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank45]: self.pp_block = self.module_builder(**self.module_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank45]: self.attn = CausalSelfAttention( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank45]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank45]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank32]: trainer = DistributedTrainer(config_file) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank32]: self.model = self.init_model() # Defines self.model -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank32]: model = self._init_model_instance() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank32]: model = self._init_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank32]: model = build_model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank32]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank32]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank32]: self.attn = CausalSelfAttention( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank32]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank32]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank38]: trainer = DistributedTrainer(config_file) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank38]: self.model = self.init_model() # Defines self.model -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank38]: model = self._init_model_instance() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank38]: model = self._init_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank38]: model = build_model( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank38]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank38]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank38]: self.attn = CausalSelfAttention( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank38]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank38]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank36]: trainer = DistributedTrainer(config_file) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank36]: self.model = self.init_model() # Defines self.model -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank36]: model = self._init_model_instance() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank36]: model = self._init_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank36]: model = build_model( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank36]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank36]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank36]: self.attn = CausalSelfAttention( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank36]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank36]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank43]: trainer = DistributedTrainer(config_file) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank43]: self.model = self.init_model() # Defines self.model -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank43]: model = self._init_model_instance() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank43]: model = self._init_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank43]: model = build_model( -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default3]:[rank35]: trainer = DistributedTrainer(config_file) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default3]:[rank35]: self.model = self.init_model() # Defines self.model -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default3]:[rank35]: model = self._init_model_instance() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default3]:[rank35]: model = self._init_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank35]: model = build_model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank43]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank43]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank43]: self.attn = CausalSelfAttention( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank16]: trainer = DistributedTrainer(config_file) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank16]: self.model = self.init_model() # Defines self.model -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank16]: model = self._init_model_instance() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance 
-[default0]:[rank16]: model = self._init_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank16]: model = build_model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank35]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank43]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank43]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank35]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank35]: self.attn = CausalSelfAttention( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank35]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank35]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default2]:[rank10]: trainer = DistributedTrainer(config_file) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default2]:[rank10]: self.model = self.init_model() # Defines self.model -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default2]:[rank10]: model = self._init_model_instance() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default2]:[rank10]: model = self._init_model( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default2]:[rank10]: model = build_model( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank16]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default2]:[rank10]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default2]:[rank10]: self.pp_block = self.module_builder(**self.module_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default2]:[rank10]: self.attn = CausalSelfAttention( -[default0]:[rank16]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank16]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank16]: self.attn = CausalSelfAttention( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank16]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank16]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default2]:[rank10]: config.num_attention_heads % tp_pg.size() == 0 -[default2]:[rank10]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default4]:[rank12]: trainer = DistributedTrainer(config_file) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default4]:[rank12]: self.model = self.init_model() # Defines self.model -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default4]:[rank12]: model = self._init_model_instance() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default4]:[rank12]: model = self._init_model( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default4]:[rank12]: model = build_model( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default4]:[rank12]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default4]:[rank12]: self.pp_block = self.module_builder(**self.module_kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default4]:[rank12]: self.attn = CausalSelfAttention( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default4]:[rank12]: config.num_attention_heads % tp_pg.size() == 0 -[default4]:[rank12]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank15]: trainer = DistributedTrainer(config_file) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank15]: self.model = self.init_model() # Defines self.model -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank15]: model = self._init_model_instance() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank15]: model = self._init_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank15]: model = build_model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank15]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank15]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank15]: self.attn = CausalSelfAttention( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank15]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank15]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank14]: trainer = DistributedTrainer(config_file) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank14]: self.model = self.init_model() # Defines self.model -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank14]: model = self._init_model_instance() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank14]: model = self._init_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank14]: model = build_model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank14]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank14]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank14]: self.attn = CausalSelfAttention( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank14]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank14]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank39]: trainer = DistributedTrainer(config_file) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank39]: self.model = self.init_model() # Defines self.model -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank39]: model = self._init_model_instance() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank39]: model = self._init_model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank39]: model = build_model( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank39]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank39]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank39]: self.attn = CausalSelfAttention( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank39]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank39]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default5]:[rank37]: trainer = DistributedTrainer(config_file) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default5]:[rank37]: self.model = self.init_model() # Defines self.model -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default5]:[rank37]: model = self._init_model_instance() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default5]:[rank37]: model = self._init_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default5]:[rank37]: model = build_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default5]:[rank37]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default5]:[rank37]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default0]:[rank40]: trainer = DistributedTrainer(config_file) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default0]:[rank40]: self.model = self.init_model() # Defines self.model -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default0]:[rank40]: model = self._init_model_instance() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default0]:[rank40]: model = self._init_model( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default0]:[rank40]: model = build_model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default5]:[rank37]: self.attn = CausalSelfAttention( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default5]:[rank37]: config.num_attention_heads % tp_pg.size() == 0 -[default5]:[rank37]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default0]:[rank40]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default0]:[rank40]: self.pp_block = self.module_builder(**self.module_kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default0]:[rank40]: self.attn = CausalSelfAttention( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default0]:[rank40]: config.num_attention_heads % tp_pg.size() == 0 -[default0]:[rank40]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank33]: trainer = DistributedTrainer(config_file) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank33]: self.model = self.init_model() # Defines self.model -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank33]: model = self._init_model_instance() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank33]: model = self._init_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank33]: model = build_model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank33]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank33]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank33]: self.attn = CausalSelfAttention( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank33]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank33]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank9]: trainer = DistributedTrainer(config_file) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank9]: self.model = self.init_model() # Defines self.model -[default3]:[rank11]: trainer = DistributedTrainer(config_file) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank9]: model = self._init_model_instance() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank9]: model = self._init_model( -[default3]:[rank11]: self.model = self.init_model() # Defines self.model -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank9]: model = build_model( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank9]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank9]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank9]: self.attn = CausalSelfAttention( -[default3]:[rank11]: model = self._init_model_instance() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default1]:[rank9]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: model = self._init_model( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default3]:[rank11]: model = build_model( -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default1]:[rank41]: trainer = DistributedTrainer(config_file) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default1]:[rank41]: self.model = self.init_model() # Defines self.model -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default1]:[rank41]: model = self._init_model_instance() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", 
line 682, in _init_model_instance -[default1]:[rank41]: model = self._init_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default1]:[rank41]: model = build_model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default1]:[rank41]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default1]:[rank41]: self.pp_block = self.module_builder(**self.module_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default1]:[rank41]: self.attn = CausalSelfAttention( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default1]:[rank41]: config.num_attention_heads % tp_pg.size() == 0 -[default1]:[rank41]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default6]:[rank46]: trainer = DistributedTrainer(config_file) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default6]:[rank46]: self.model = self.init_model() # Defines self.model -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default6]:[rank46]: model = self._init_model_instance() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default6]:[rank46]: model = self._init_model( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default6]:[rank46]: model = build_model( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default6]:[rank46]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default6]:[rank46]: self.pp_block = self.module_builder(**self.module_kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default6]:[rank46]: self.attn = CausalSelfAttention( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default6]:[rank46]: config.num_attention_heads % tp_pg.size() == 0 -[default6]:[rank46]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
-[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in -[default7]:[rank47]: trainer = DistributedTrainer(config_file) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 172, in __init__ -[default7]:[rank47]: self.model = self.init_model() # Defines self.model -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 672, in init_model -[default7]:[rank47]: model = self._init_model_instance() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 682, in _init_model_instance -[default7]:[rank47]: model = self._init_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 751, in _init_model -[default7]:[rank47]: model = build_model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default7]:[rank47]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default7]:[rank47]: self.pp_block = self.module_builder(**self.module_kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default7]:[rank47]: self.attn = CausalSelfAttention( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default7]:[rank47]: config.num_attention_heads % tp_pg.size() == 0 -[default7]:[rank47]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default1]:[rank9]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/base.py", line 230, in build_model -[default3]:[rank11]: block.build_and_set_rank(target_pp_ranks[target_pp_rank_idx]) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 52, in build_and_set_rank -[default3]:[rank11]: self.pp_block = self.module_builder(**self.module_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 613, in __init__ -[default3]:[rank11]: self.attn = CausalSelfAttention( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 271, in __init__ -[default3]:[rank11]: config.num_attention_heads % tp_pg.size() == 0 -[default3]:[rank11]: AssertionError: Number of attention heads (32) must be divisible by TP size (64). 
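
Every rank above aborts on the same check in CausalSelfAttention.__init__: the tensor-parallel process-group size must divide the attention-head count, and 32 heads cannot be sharded across a TP group of 64. The snippet below is a minimal, self-contained sketch of that invariant; num_attention_heads and tp_size are illustrative stand-ins for config.num_attention_heads and tp_pg.size(), not the nanotron source itself.

    # Illustrative sketch of the invariant every rank trips on.
    def check_heads_vs_tp(num_attention_heads: int, tp_size: int) -> int:
        # Each TP rank must own a whole number of attention heads.
        assert num_attention_heads % tp_size == 0, (
            f"Number of attention heads ({num_attention_heads}) must be "
            f"divisible by TP size ({tp_size})."
        )
        return num_attention_heads // tp_size  # heads handled per TP rank

    check_heads_vs_tp(32, 16)       # ok: each TP rank owns 2 heads
    try:
        check_heads_vs_tp(32, 64)   # reproduces the AssertionError in this log
    except AssertionError as exc:
        print(exc)
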
-E0703 09:42:51.453000 140425593186112 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2474879) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 33 (local_rank: 1) - exitcode : 1 (pid: 2474880) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 2474881) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 35 (local_rank: 3) - exitcode : 1 (pid: 2474882) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 2474883) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 37 (local_rank: 5) - exitcode : 1 (pid: 2474884) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 (pid: 2474885) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 39 (local_rank: 7) - exitcode : 1 (pid: 2474886) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-132.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 2474879) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
-============================================================ -E0703 09:42:51.548000 139684394833728 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1210534) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:42:51.548000 140239149406016 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 973868) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:42:51.549000 140680035198784 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1132318) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:42:51.549000 140281013716800 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3288551) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:42:51.550000 139863566198592 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1903854) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:42:51.551000 140270174660416 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 841914) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:42:51.557000 139850624862016 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1931138) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 973869) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 973870) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 973871) - error_file: - traceback : To enable 
traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 973872) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 973873) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 973874) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 973875) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-73.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 973868) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( 
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 1132319) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 1132320) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 1132321) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 1132322) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 1132323) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 1132324) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 1132325) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 1132318) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1210535) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
-[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1210536) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1210537) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1210538) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1210539) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1210540) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1210541) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/el return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent -astic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1210534) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 1903855) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : 1 (pid: 1903856) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 1903857) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : 1 (pid: 1903858) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : 1 (pid: 1903859) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : 
ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : 1 (pid: 1903860) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : 1 (pid: 1903861) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 1903854) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 841915) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 841916) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 841917) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 12 (local_rank: 
4) - exitcode : 1 (pid: 841918) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 841919) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 841920) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 841921) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-220.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 841914) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1931139) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 1931140) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1931141) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 1931142) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1931143) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1931144) - 
error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1931145) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-168-238.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1931138) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 3288552) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 3288553) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 3288554) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 3288555) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 3288556) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 3288557) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 3288558) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:42:51 - host : ip-26-0-163-226.ec2.internal 
- rank : 16 (local_rank: 0) - exitcode : 1 (pid: 3288551) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-132: task 5: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 4: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 1: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt b/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt deleted file mode 100644 index f45d4b694dabe123ce5930eafeb2d3234f2a4d76..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-1_tp-64_pp-1_mbz-8/status.txt +++ /dev/null @@ -1 +0,0 @@ -fail \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/bench.slurm deleted file mode 100644 index 4e27992e0bf0dd0c936c908721695a0f4e132438..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/config.yaml deleted file mode 100644 index 1a809f6d02cef9b70e663fa6c13bc4b73f92c646..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 512 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out deleted file mode 100644 index 9a126b4c8816b389538772ba773cbadda036acb5..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/log.out +++ /dev/null @@ -1,5863 +0,0 @@ -======================== -START TIME: Wed Jul 3 00:01:34 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 00:01:37.328000 140698582562624 torch/distributed/run.py:757] -W0703 00:01:37.328000 140698582562624 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.328000 140698582562624 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:01:37.328000 140698582562624 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.329000 139789579990848 torch/distributed/run.py:757] -W0703 00:01:37.329000 139789579990848 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.329000 139789579990848 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:01:37.329000 139789579990848 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.343000 139981954107200 torch/distributed/run.py:757] -W0703 00:01:37.343000 139981954107200 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.343000 139981954107200 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:01:37.343000 139981954107200 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.347000 140144279455552 torch/distributed/run.py:757] -W0703 00:01:37.347000 140144279455552 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.347000 140144279455552 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:01:37.347000 140144279455552 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.351000 140091496240960 torch/distributed/run.py:757] -W0703 00:01:37.351000 140091496240960 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.351000 140091496240960 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 00:01:37.351000 140091496240960 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.360000 139763312441152 torch/distributed/run.py:757] -W0703 00:01:37.360000 139763312441152 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.360000 139763312441152 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:01:37.360000 139763312441152 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.390000 139765502732096 torch/distributed/run.py:757] -W0703 00:01:37.390000 139765502732096 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.390000 139765502732096 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:01:37.390000 139765502732096 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.431000 139702622873408 torch/distributed/run.py:757] -W0703 00:01:37.431000 139702622873408 torch/distributed/run.py:757] ***************************************** -W0703 00:01:37.431000 139702622873408 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:01:37.431000 139702622873408 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 00:01:57 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=2, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:01:57 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=1, -[default0]:07/03/2024 00:01:57 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=512, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1')), -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 00:01:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default0]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=8|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=13|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=15|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=2|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=12|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=11|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=9|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=10|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=14|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=6|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=5|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=1|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=5|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=1|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=10|ip-26-0-165-24]: No checkpoint path provided. 
-[default3]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=3|ip-26-0-172-57]: No checkpoint path provided. -[default5]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=4|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=7|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=0|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=1|PP=1|TP=6|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=3|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=2|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=7|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 00:02:14 [INFO|DP=1|PP=0|TP=4|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. 
Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. 
Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: No checkpoint path provided. 
-[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: No checkpoint path provided. 
-[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 00:02:14 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 00:02:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 00:02:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 00:02:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 00:02:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 00:02:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 00:02:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 00:02:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 00:02:17 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 00:02:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 00:02:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 00:02:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 00:02:17.976327 | mbs: 1 | grad_accum: 512 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 00:02:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 00:02:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default2]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=2|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=12|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=14|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=5|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=6|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=1|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=3|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=7|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=0|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=3|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=2|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=7|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=15|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=8|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=11|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=9|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=10|ip-26-0-172-73]: Repo card metadata block was not found. 
Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=1|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=5|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=4|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=13|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:02:18 [WARNING|DP=1|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=4|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:02:18 [WARNING|DP=1|PP=1|TP=6|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:02:18 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:02:18 [WARNING|DP=0|PP=1|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. 
This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
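[Note, not part of the captured log: the watchdog messages above report that individual NCCL SEND operations hit the 600000 ms (10-minute) collective timeout, after which ProcessGroupNCCL aborts the communicator; that abort is what surfaces as the torch.distributed.DistBackendError in the tracebacks that follow. If a run is expected to sit idle between pipeline sends/receives for longer than this rather than being genuinely hung, one generic mitigation is to initialize the process group with a larger timeout. The snippet below is only an illustrative sketch of that knob, not part of the benchmark scripts, and the 30-minute value is an arbitrary example; nanotron sets up its own process groups.]
# Sketch (not from the benchmark scripts): raising the NCCL collective timeout
# that the watchdog enforces (shown as Timeout(ms)=600000 in the log above).
# Assumes the script is launched under torchrun so RANK/WORLD_SIZE/MASTER_ADDR
# are already set in the environment.
from datetime import timedelta
import torch.distributed as dist

dist.init_process_group(
    backend="nccl",
    timeout=timedelta(minutes=30),  # the run above used the 10-minute limit
)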
-[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank31]: self.grads_buffer.append(recv_grad()) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank30]: Traceback (most recent call last): -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default6]:[rank30]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default3]:[rank27]: result = loss.backward() -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank30]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default3]:[rank27]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: self.grads_buffer.append(recv_grad()) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank30]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank30]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank27]: return user_fn(self, *args) -[default6]:[rank30]: dist.recv( 
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank27]: self.grads_buffer.append(recv_grad()) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank27]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank27]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank27]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank27]: dist.recv( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank27]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600071 milliseconds before timing out.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600070 milliseconds before timing out.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600080 milliseconds before timing out.
-[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank61]: Traceback (most recent call last):
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default5]:[rank61]:     trainer.train(dataloader)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank61]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank61]:     outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default5]:[rank61]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank61]:     output = model(**micro_batch)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank61]:     return self._call_impl(*args, **kwargs)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank61]:     return forward_call(*args, **kwargs)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank61]:     sharded_logits = self.model(
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank61]:     return self._call_impl(*args, **kwargs)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank61]:     return forward_call(*args, **kwargs)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank61]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank61]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank61]:     return self._call_impl(*args, **kwargs)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank61]:     return forward_call(*args, **kwargs)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank61]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank61]:     pipeline_state.run_communication()
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank61]:     recv_activation_tensor = recv_activation()
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank61]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank61]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank61]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank61]:     dist.recv(
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank61]:     return func(*args, **kwargs)
-[default5]:[rank61]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank61]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank19]: self.grads_buffer.append(recv_grad()) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank19]: dist.recv( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank18]: result = loss.backward() -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: return user_fn(self, *args) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank18]: self.grads_buffer.append(recv_grad())
-[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank18]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank18]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank18]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank18]: dist.recv( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank18]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank22]: self.grads_buffer.append(recv_grad()) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank22]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank22]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank22]: dist.recv( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank22]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank20]: grad_accumulator.backward(sum(activations)) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank20]: result = loss.backward() -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank20]: torch.autograd.backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank20]: _engine_run_backward( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank20]: return user_fn(self, *args) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank20]: pipeline_state.run_communication() -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank20]: self.grads_buffer.append(recv_grad()) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank20]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank20]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank20]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank20]: dist.recv( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank20]: return func(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank20]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank25]: self.grads_buffer.append(recv_grad())
-[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank25]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank25]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank25]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank25]: dist.recv( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank25]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank24]: self.grads_buffer.append(recv_grad()) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank24]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank24]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank24]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default0]:[rank24]: dist.recv( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank24]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) 
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank11]: self.grads_buffer.append(recv_grad()) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank11]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank11]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank11]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default3]:[rank11]: dist.recv( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank11]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fccda480897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fccdb759c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fccdb75ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fccdb75fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fcd271f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fcd2c23f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fcd2c00a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fccda480897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fccdb759c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fccdb75ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fccdb75fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fcd271f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fcd2c23f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fcd2c00a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fccda480897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fccdb3e3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fcd271f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fcd2c23f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fcd2c00a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank23]: self.grads_buffer.append(recv_grad()) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank23]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank23]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank23]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank23]: dist.recv( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank23]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank5]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default5]:[rank5]: dist.recv( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank4]: self.grads_buffer.append(recv_grad()) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank4]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank4]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank4]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank4]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank4]: dist.recv( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank4]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in 
forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank10]: self.grads_buffer.append(recv_grad()) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank10]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank10]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank10]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank10]: dist.recv( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank10]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: self.grads_buffer.append(recv_grad()) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank8]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank8]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank8]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank8]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank8]: dist.recv( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank8]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in 
forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank9]: self.grads_buffer.append(recv_grad()) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank9]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank9]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank9]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default1]:[rank9]: dist.recv( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank9]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank29]: self.grads_buffer.append(recv_grad()) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank29]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank29]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank29]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank29]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank29]: dist.recv( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank29]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. 
-[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd32b2e9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd32c5c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd32c5c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd32c5c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd378061e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd37d0a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd37ce73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd32b2e9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd32c5c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd32c5c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd32c5c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd378061e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd37d0a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd37ce73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd32b2e9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fd32c24c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fd378061e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fd37d0a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fd37ce73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a28cc8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4a29fa1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4a29fa6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4a29fa7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4a75a40e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4a7aa87609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4a7a852353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a28cc8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4a29fa1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4a29fa6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4a29fa7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4a75a40e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4a7aa87609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4a7a852353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a28cc8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4a29c2b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f4a75a40e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f4a7aa87609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4a7a852353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank28]: self.grads_buffer.append(recv_grad()) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank28]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank28]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default4]:[rank28]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank28]: dist.recv( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank28]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5adee90897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5ae0169c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5ae016ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5ae016fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f5b2bc08e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f5b30c4f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5b30a1a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5adee90897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5ae0169c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5ae016ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5ae016fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f5b2bc08e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f5b30c4f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5b30a1a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5adee90897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f5adfdf3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f5b2bc08e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f5b30c4f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f5b30a1a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd241548897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd242821c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd242826a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd242827dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd28e2c0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd293307609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd2930d2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd241548897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd242821c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd242826a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd242827dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd28e2c0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd293307609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd2930d2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd241548897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fd2424ab119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fd28e2c0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fd293307609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fd2930d2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1cb0f5b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59dad64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59dc03dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59dc042a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59dc043dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1cb2234c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1cb2239a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1cb223adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f1cfdcd3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f1d02d1a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f1d02ae5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1cb0f5b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1cb2234c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f5a27adce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1cb2239a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1cb223adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f1cfdcd3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f5a2cb23609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f5a2c8ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59dad64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f59dc03dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #5: + 0x8609 (0x7f1d02d1a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f59dc042a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f59dc043dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f5a27adce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f5a2cb23609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f5a2c8ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f59dad64897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f59dbcc7119 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f5a27adce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f5a2cb23609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f1d02ae5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:frame #4: clone + 0x43 (0x7f5a2c8ee353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]: -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1cb0f5b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f1cb1ebe119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f1cfdcd3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f1d02d1a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f1d02ae5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return 
forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c25ff5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c272cec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c272d3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c272d4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c72d6de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3c77db4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3c77b7f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c25ff5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c272cec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c272d3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c272d4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c72d6de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3c77db4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3c77b7f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c25ff5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3c26f58119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f3c72d6de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f3c77db4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f3c77b7f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank35]: Traceback (most recent call last): -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( 
-[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: pipeline_state.run_communication() -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: pipeline_state.run_communication() -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ea1cc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0ea2f9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0ea2fa4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0ea2fa5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f0eeea3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0ef3a85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0ef3850353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ea1cc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0ea2f9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0ea2fa4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0ea2fa5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f0eeea3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0ef3a85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f0ef3850353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ea1cc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f0ea2c29119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f0eeea3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f0ef3a85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f0ef3850353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f159b34e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f159c627c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f159c62ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f159c62ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f15e80c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f15ed10d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f15eced8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f159b34e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f159c627c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f159c62ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f159c62ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f15e80c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f15ed10d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f15eced8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f159b34e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f159c2b1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f15e80c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f15ed10d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f15eced8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank63]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper
-[default7]:[rank63]: return func(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6b5154897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6b642dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6b6432a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6b6433dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7fa701ecce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7fa706f13609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fa706cde353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600007 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6b5154897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6b642dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6b6432a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6b6433dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7fa701ecce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7fa706f13609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fa706cde353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa6b5154897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7fa6b60b7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: + 0xd3e95 (0x7fa701ecce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #3: + 0x8609 (0x7fa706f13609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #4: clone + 0x43 (0x7fa706cde353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
[... equivalent ProcessGroupNCCL watchdog timeout blocks appear for rank49 (600005 ms) and rank55 (600006 ms) on [PG 4 Rank 1] (OpType=SEND, NumelIn=524288), and for rank24 (600043 ms) and rank11 (600089 ms) on [PG 4 Rank 0] (OpType=SEND, NumelIn=4096) ...]
-[default0]:[rank56]: Traceback (most recent call last):
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank56]: trainer.train(dataloader)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank56]: output = model(**micro_batch)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank56]: sharded_logits = self.model(
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default0]:[rank56]: pipeline_state.run_communication()
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default0]:[rank56]: recv_activation_tensor = recv_activation()
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank56]: dist.recv(
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank56]: return func(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... identical forward-path tracebacks (recv_from_pipeline_state_buffer -> run_communication -> recv_activation -> p2p recv_tensors -> _recv_meta -> dist.recv) appear for ranks 33, 34, 36, 44, 46, 52 and 62, each ending with "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." ...]
-[default0]:[rank0]: Traceback (most recent call last):
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank0]: trainer.train(dataloader)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank0]: grad_accumulator.backward(sum(activations))
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank0]: result = loss.backward()
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank0]: torch.autograd.backward(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank0]: _engine_run_backward(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank0]: return user_fn(self, *args)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank0]: pipeline_state.run_communication()
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default0]:[rank0]: self.grads_buffer.append(recv_grad())
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default0]:[rank0]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank0]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank0]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank0]: dist.recv(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank0]: return func(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank0]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank6]: Traceback (most recent call last):
-[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default7]:[rank7]: Traceback (most recent call last):
-[default6]:[rank6]: trainer.train(dataloader)
-[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank7]: trainer.train(dataloader)
-[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank6]: grad_accumulator.backward(sum(activations))
-[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: result = loss.backward() -[default6]:[rank6]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank6]: self.grads_buffer.append(recv_grad()) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: torch.autograd.backward( -[default6]:[rank6]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: _engine_run_backward( -[default6]:[rank6]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 
744, in _engine_run_backward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default6]:[rank6]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank6]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank6]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank7]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05f21b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f05f348dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f05f3492a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f05f3493dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f063ef2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0643f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:frame #6: clone + 0x43 (0x7f0643d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]: self.grads_buffer.append(recv_grad()) -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05f21b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank1]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f05f348dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f05f3492a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f05f3493dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank1]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank1]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8943f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #4: + 0xd3e95 (0x7f063ef2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0643f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd8956d0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd8956d5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #6: clone + 0x43 (0x7f0643d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank1]: dist.recv( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:[rank1]: return func(*args, **kwargs) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd8956d6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05f21b4897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f05f3117119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank1]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:frame #2: + 0xd3e95 (0x7f063ef2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #4: + 0xd3e95 (0x7fd8e116fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd8e61b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #3: + 0x8609 (0x7f0643f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:frame #6: clone + 0x43 (0x7fd8e5f81353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #4: clone + 0x43 (0x7f0643d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8943f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd8956d0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd8956d5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd8956d6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd8e116fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd8e61b6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd8e5f81353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8943f7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd89535a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd8e116fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fd8e61b6609 in 
/lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd8e5f81353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. 
Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f28fc8fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf3075f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f28fdbd5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcf31a38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f28fdbdaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcf31a3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f28fdbdbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2949674e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcf31a3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:frame #5: + 0x8609 (0x7f294e6bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: + 0xd3e95 (0x7fcf7d4d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default3]:frame #5: + 0x8609 (0x7fcf8251e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:frame #6: clone + 0x43 (0x7f294e486353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #6: clone + 0x43 (0x7fcf822e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]: -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f66b2ec1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f66b419ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f66b419fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f28fc8fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f28fdbd5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf3075f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcf31a38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcf31a3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f28fdbdaa80 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f66b41a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcf31a3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f28fdbdbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f66ffc39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #4: + 0xd3e95 (0x7f2949674e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #4: + 0xd3e95 (0x7fcf7d4d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f6704c80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #5: + 0x8609 (0x7f294e6bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #5: + 0x8609 (0x7fcf8251e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:frame #6: clone + 0x43 (0x7f294e486353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:frame #6: clone + 0x43 (0x7f6704a4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]: -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:frame #6: clone + 0x43 (0x7fcf822e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f28fc8fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f28fd85f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: -[default5]:[rank53]: dist.recv( -[default6]:frame #2: + 0xd3e95 (0x7f2949674e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f294e6bb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf3075f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fcf316c2119 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default3]:frame #2: + 0xd3e95 (0x7fcf7d4d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:[rank53]: return func(*args, **kwargs) -[default6]:frame #4: clone + 0x43 (0x7f294e486353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:frame #3: + 0x8609 (0x7fcf8251e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]: -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f66b2ec1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #4: clone + 0x43 (0x7fcf822e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f66b419ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f66b419fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f66b41a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:frame #4: + 0xd3e95 (0x7f66ffc39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f6704c80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f6704a4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f66b2ec1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f66b3e24119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f66ffc39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f6704c80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f6704a4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. 
Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6df01d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6df14adc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6df14b2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6df14b3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f6e3cf4ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f6e41f93609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f6e41d5e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6df01d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6df14adc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6df14b2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6df14b3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f6e3cf4ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f6e41f93609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f6e41d5e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6df01d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f6df1137119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f6e3cf4ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f6e41f93609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f6e41d5e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank13]: self.grads_buffer.append(recv_grad()) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank13]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank13]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank13]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank13]: dist.recv( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank13]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05d8372897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f05d964bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f05d9650a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f05d9651dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f06250eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f062a131609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f0629efc353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
[... every rank below logs the same ProcessGroupNCCL.cpp:1537/577/583 messages shown above for rank 8, followed by its watchdog exception, a terminate/what() message repeating the same timeout text, and the same checkTimeout / ncclCommWatchdog frame listing (frames #0-#6, differing only in library load addresses); those repeated blocks are elided ...]
[... rank 2 emits the same backward-pass traceback as rank 13 above (trainer.py -> pipeline_parallel/engine.py -> gradient_accumulator.py -> torch.autograd -> pipeline_parallel/functional.py -> state.py -> p2p.py _recv_meta -> dist.recv) and ends with ...]
-[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600099 milliseconds before timing out.
-[default5]:[rank45]: Traceback (most recent call last):
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default5]:[rank45]: trainer.train(dataloader)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank45]: output = model(**micro_batch)
[... torch/nn/modules/module.py _wrapped_call_impl / _call_impl frames elided ...]
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank45]: sharded_logits = self.model(
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank45]: pipeline_state.run_communication()
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank45]: recv_activation_tensor = recv_activation()
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
[... same p2p.py recv_tensors -> irecv_tensors -> _recv_meta -> dist.recv frames as rank 13 ...]
-[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
[... ranks 3, 14, 16 and 21 emit the same backward-pass traceback as ranks 13 and 2 (their output streams are interleaved in the original log) and end with ...]
-[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600027 milliseconds before timing out.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600048 milliseconds before timing out.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600023 milliseconds before timing out.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600036 milliseconds before timing out.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600042 milliseconds before timing out.
-[default7]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000)
ran for 600042 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b88512897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2b897ebc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2b897f0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2b897f1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f2bd528ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f2bda2d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f2bda09c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2b88512897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f2b89475119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f2bd528ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f2bda2d1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f2bda09c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank12]: self.grads_buffer.append(recv_grad()) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank12]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank12]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank12]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank12]: dist.recv( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank12]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank15]: self.grads_buffer.append(recv_grad()) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank15]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank15]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank15]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default7]:[rank15]: dist.recv( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank15]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff5069c0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff507c99c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff507c9ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff507c9fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff553738e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff55877f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff55854a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff5069c0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff507c99c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff507c9ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff507c9fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff553738e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff55877f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff55854a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff5069c0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7ff507923119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7ff553738e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7ff55877f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7ff55854a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4d16d52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4d1802bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4d18030a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4d18031dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f4d63acae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f4d68b11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f4d688dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4d16d52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4d1802bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4d18030a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4d18031dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f4d63acae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f4d68b11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f4d688dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4d16d52897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f4d17cb5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f4d63acae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f4d68b11609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f4d688dc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7face9fc9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faceb2a2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faceb2a7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faceb2a8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fad36d41e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fad3bd88609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fad3bb53353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7face9fc9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faceb2a2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faceb2a7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faceb2a8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fad36d41e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fad3bd88609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fad3bb53353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7face9fc9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7faceaf2c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fad36d41e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fad3bd88609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fad3bb53353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbb9410897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efbba6e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efbba6eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efbba6efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7efc06188e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7efc0b1cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7efc0af9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbb9410897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efbba6e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efbba6eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efbba6efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7efc06188e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7efc0b1cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7efc0af9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbb9410897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7efbba373119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7efc06188e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7efc0b1cf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7efc0af9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8f36ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd8f49c7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd8f49cca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd8f49cddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd940466e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9454ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd945278353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8f36ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd8f49c7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd8f49cca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd8f49cddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd940466e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9454ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd945278353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8f36ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fd8f4651119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd940466e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fd9454ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd945278353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, 
**kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc512a9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc513d78c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc513d7da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc513d7edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fc55f817e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fc56485e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fc564629353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc512a9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc513d78c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc513d7da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc513d7edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fc55f817e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fc56485e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fc564629353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc512a9f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fc513a02119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fc55f817e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fc56485e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fc564629353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc3b9a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc3cc7ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc3cc83a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc3cc84dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fdc8871de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fdc8d764609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fdc8d52f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc3b9a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdc3cc7ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdc3cc83a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdc3cc84dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fdc8871de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fdc8d764609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fdc8d52f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdc3b9a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fdc3c908119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fdc8871de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fdc8d764609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fdc8d52f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default0]:[rank32]: Traceback (most recent call last):
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank32]: trainer.train(dataloader)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank32]: output = model(**micro_batch)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank32]: return self._call_impl(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank32]: return forward_call(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank32]: sharded_logits = self.model(
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank32]: return self._call_impl(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank32]: return forward_call(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank32]: return self._call_impl(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank32]: return forward_call(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default0]:[rank32]: pipeline_state.run_communication()
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default0]:[rank32]: recv_activation_tensor = recv_activation()
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank32]: dist.recv(
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank32]: return func(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e2f7ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3e30ac7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3e30acca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3e30acddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3e7c566e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3e815ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3e81378353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e2f7ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3e30ac7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3e30acca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3e30acddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3e7c566e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3e815ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3e81378353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e2f7ee897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f3e30751119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f3e7c566e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f3e815ad609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f3e81378353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47fb493897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47fc76cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47fc771a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47fc772dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f484820be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f484d252609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f484d01d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47fb493897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47fc76cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47fc771a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47fc772dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f484820be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f484d252609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f484d01d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47fb493897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f47fc3f6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f484820be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f484d252609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f484d01d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f51a953b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f51aa814c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f51aa819a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f51aa81adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f51f62b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f51fb2fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f51fb0c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f51a953b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f51aa814c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f51aa819a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f51aa81adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f51f62b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f51fb2fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f51fb0c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f51a953b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f51aa49e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f51f62b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f51fb2fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f51fb0c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efe0f057897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efe10330c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efe10335a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efe10336dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7efe5bdcfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7efe60e16609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7efe60be1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efe0f057897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efe10330c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efe10335a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efe10336dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7efe5bdcfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7efe60e16609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7efe60be1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efe0f057897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7efe0ffba119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7efe5bdcfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7efe60e16609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7efe60be1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e35d3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8e37017c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8e3701ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8e3701ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f8e82ab6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f8e87afd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f8e878c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e35d3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8e37017c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8e3701ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8e3701ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f8e82ab6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f8e87afd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f8e878c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8e35d3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f8e36ca1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f8e82ab6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f8e87afd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f8e878c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f021d55b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f021e834c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f021e839a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f021e83adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f026a2d3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f026f31a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f026f0e5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f021d55b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f021e834c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f021e839a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f021e83adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f026a2d3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f026f31a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f026f0e5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f021d55b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f021e4be119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f026a2d3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f026f31a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f026f0e5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76fe4db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f76ff7b4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f76ff7b9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76ff7badcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f774b253e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f775029a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f7750065353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76fe4db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f76ff7b4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f76ff7b9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76ff7badcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f774b253e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f775029a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f7750065353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76fe4db897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f76ff43e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f774b253e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f775029a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f7750065353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc0227ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc023a84c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc023a89a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc023a8adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc06f523e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc07456a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc074335353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc0227ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc023a84c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc023a89a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc023a8adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc06f523e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc07456a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc074335353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc0227ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fc02370e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fc06f523e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fc07456a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fc074335353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ffe3c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0fff6a1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0fff6a6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0fff6a7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f104b140e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1050187609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f104ff52353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ffe3c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0fff6a1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0fff6a6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0fff6a7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f104b140e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1050187609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f104ff52353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0ffe3c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f0fff32b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f104b140e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f1050187609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f104ff52353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600027 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f75b8180897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f75b9459c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f75b945ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f75b945fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f7604ef8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f7609f3f609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f7609d0a353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600027 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f75b8180897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f75b9459c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f75b945ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f75b945fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7f7604ef8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7f7609f3f609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f7609d0a353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f75b8180897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: + 0xe32119 (0x7f75b90e3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: + 0xd3e95 (0x7f7604ef8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: + 0x8609 (0x7f7609f3f609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7f7609d0a353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd04b5d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd04c8b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd04c8b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd04c8b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7fd098351e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7fd09d398609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7fd09d163353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd04b5d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd04c8b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd04c8b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd04c8b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7fd098351e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7fd09d398609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7fd09d163353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd04b5d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: + 0xe32119 (0x7fd04c53c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: + 0xd3e95 (0x7fd098351e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: + 0x8609 (0x7fd09d398609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7fd09d163353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa0b3ccd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa0b4fa6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa0b4faba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa0b4facdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7fa100a45e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7fa105a8c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7fa105857353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa0b3ccd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa0b4fa6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa0b4faba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa0b4facdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7fa100a45e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7fa105a8c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7fa105857353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa0b3ccd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: + 0xe32119 (0x7fa0b4c30119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: + 0xd3e95 (0x7fa100a45e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #3: + 0x8609 (0x7fa105a8c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7fa105857353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa2ba18897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffa2ccf1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffa2ccf6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffa2ccf7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: + 0xd3e95 (0x7ffa78790e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: + 0x8609 (0x7ffa7d7d7609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7ffa7d5a2353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa2ba18897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffa2ccf1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffa2ccf6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffa2ccf7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: + 0xd3e95 (0x7ffa78790e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: + 0x8609 (0x7ffa7d7d7609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7ffa7d5a2353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa2ba18897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: + 0xe32119 (0x7ffa2c97b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: + 0xd3e95 (0x7ffa78790e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #3: + 0x8609 (0x7ffa7d7d7609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #4: clone + 0x43 (0x7ffa7d5a2353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f51019e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5102cc1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5102cc6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5102cc7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: + 0xd3e95 (0x7f514e760e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: + 0x8609 (0x7f51537a7609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f5153572353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f51019e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5102cc1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5102cc6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5102cc7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: + 0xd3e95 (0x7f514e760e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: + 0x8609 (0x7f51537a7609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f5153572353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f51019e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: + 0xe32119 (0x7f510294b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: + 0xd3e95 (0x7f514e760e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #3: + 0x8609 (0x7f51537a7609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: clone + 0x43 (0x7f5153572353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f451e8d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f451fbabc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f451fbb0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f451fbb1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f456b64ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f4570691609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f457045c353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f451e8d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f451fbabc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f451fbb0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f451fbb1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f456b64ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f4570691609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f457045c353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f451e8d2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: + 0xe32119 (0x7f451f835119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: + 0xd3e95 (0x7f456b64ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: + 0x8609 (0x7f4570691609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7f457045c353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fba3e2bd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fba3f596c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fba3f59ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fba3f59cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7fba8b035e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7fba9007c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7fba8fe47353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fba3e2bd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fba3f596c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fba3f59ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fba3f59cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #4: + 0xd3e95 (0x7fba8b035e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #5: + 0x8609 (0x7fba9007c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7fba8fe47353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fba3e2bd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: + 0xe32119 (0x7fba3f220119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: + 0xd3e95 (0x7fba8b035e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #3: + 0x8609 (0x7fba9007c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7fba8fe47353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a0a8fa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a0bbd3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a0bbd8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a0bbd9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: + 0xd3e95 (0x7f3a57672e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: + 0x8609 (0x7f3a5c6b9609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f3a5c484353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a0a8fa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3a0bbd3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3a0bbd8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3a0bbd9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: + 0xd3e95 (0x7f3a57672e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: + 0x8609 (0x7f3a5c6b9609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f3a5c484353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a0a8fa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: + 0xe32119 (0x7f3a0b85d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: + 0xd3e95 (0x7f3a57672e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: + 0x8609 (0x7f3a5c6b9609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f3a5c484353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600028 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f401bc1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f401cef3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f401cef8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f401cef9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: + 0xd3e95 (0x7f4068992e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: + 0x8609 (0x7f406d9d9609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f406d7a4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600028 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f401bc1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f401cef3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f401cef8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f401cef9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: + 0xd3e95 (0x7f4068992e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: + 0x8609 (0x7f406d9d9609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f406d7a4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f401bc1a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: + 0xe32119 (0x7f401cb7d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: + 0xd3e95 (0x7f4068992e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: + 0x8609 (0x7f406d9d9609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f406d7a4353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f64eaf69897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f64ec242c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f64ec247a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f64ec248dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f6537ce1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f653cd28609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f653caf3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f64eaf69897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f64ec242c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f64ec247a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f64ec248dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f6537ce1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f653cd28609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f653caf3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f64eaf69897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7f64ebecc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: + 0xd3e95 (0x7f6537ce1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #3: + 0x8609 (0x7f653cd28609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #4: clone + 0x43 (0x7f653caf3353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcfeb08c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcfec365c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcfec36aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcfec36bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: + 0xd3e95 (0x7fd037e04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: + 0x8609 (0x7fd03ce4b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7fd03cc16353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4096, NumelOut=4096, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcfeb08c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcfec365c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcfec36aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcfec36bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: + 0xd3e95 (0x7fd037e04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: + 0x8609 (0x7fd03ce4b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7fd03cc16353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcfeb08c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: + 0xe32119 (0x7fcfebfef119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: + 0xd3e95 (0x7fd037e04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #3: + 0x8609 (0x7fd03ce4b609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #4: clone + 0x43 (0x7fd03cc16353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78f8a79897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f78f9d52c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f78f9d57a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f78f9d58dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f79457f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f794a838609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f794a603353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78f8a79897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f78f9d52c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f78f9d57a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f78f9d58dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #4: + 0xd3e95 (0x7f79457f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #5: + 0x8609 (0x7f794a838609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #6: clone + 0x43 (0x7f794a603353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f78f8a79897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:frame #1: + 0xe32119 (0x7f78f99dc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:frame #2: + 0xd3e95 (0x7f79457f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default1]:frame #3: + 0x8609 (0x7f794a838609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default1]:frame #4: clone + 0x43 (0x7f794a603353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default1]:
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600074 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f15795c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f157a8a2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f157a8a7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f157a8a8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:frame #4: + 0xd3e95 (0x7f15c6341e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:frame #5: + 0x8609 (0x7f15cb388609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default7]:frame #6: clone + 0x43 (0x7f15cb153353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default7]:
-[default7]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600074 milliseconds before timing out.
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f15795c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f157a8a2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f157a8a7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f157a8a8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f15c6341e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f15cb388609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f15cb153353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f15795c9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f157a52c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f15c6341e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f15cb388609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f15cb153353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe27fe2d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe281106c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe28110ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe28110cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fe2ccba5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fe2d1bec609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fe2d19b7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe27fe2d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe281106c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe28110ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe28110cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fe2ccba5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fe2d1bec609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fe2d19b7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe27fe2d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fe280d90119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fe2ccba5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fe2d1bec609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fe2d19b7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3125d84897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f312705dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3127062a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3127063dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3172afce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3177b43609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f317790e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=524288, NumelOut=524288, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3125d84897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f312705dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3127062a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3127063dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3172afce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3177b43609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f317790e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3125d84897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f3126ce7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f3172afce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f3177b43609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f317790e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -W0703 00:12:39.226000 140698582562624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871756 closing signal SIGTERM -W0703 00:12:39.226000 140698582562624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871758 closing signal SIGTERM -W0703 00:12:39.226000 140698582562624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871763 closing signal SIGTERM -W0703 00:12:39.227000 140698582562624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871764 closing signal SIGTERM -W0703 00:12:39.257000 139789579990848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1106260 closing signal SIGTERM -W0703 00:12:39.258000 139789579990848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1106261 closing signal SIGTERM -W0703 00:12:39.258000 139789579990848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1106262 closing signal SIGTERM -W0703 00:12:39.258000 139789579990848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1106263 closing signal SIGTERM -W0703 00:12:39.258000 139789579990848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1106266 closing signal SIGTERM -W0703 00:12:39.258000 139789579990848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1106267 closing signal SIGTERM -E0703 00:12:40.241000 140698582562624 
torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 871757) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:39 - host : ip-26-0-172-73.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 871759) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 871759 -[2]: - time : 2024-07-03_00:12:39 - host : ip-26-0-172-73.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 871760) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 871760 -[3]: - time : 2024-07-03_00:12:39 - host : ip-26-0-172-73.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 871762) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 871762 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:39 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 871757) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 871757 -============================================================ -E0703 00:12:40.889000 139789579990848 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 4 (pid: 1106264) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:39 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1106265) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1106265 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:39 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1106264) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1106264 -============================================================ -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -W0703 00:12:43.012000 139759841998592 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3189970_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:43.905000 140085835507456 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_866606_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:43.994000 140138618722048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-178.ec2.internal_493638_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.003000 139976293373696 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1029976_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.007000 139696962139904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1830290_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.095000 139757651707648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1802653_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 00:12:44.229000 139702622873408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1830360 closing signal SIGTERM -W0703 00:12:44.230000 139702622873408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1830363 closing signal SIGTERM -W0703 00:12:44.230000 139702622873408 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1830367 closing signal SIGTERM -W0703 00:12:44.314000 139765502732096 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3190042 closing signal SIGTERM -E0703 00:12:44.448000 140144279455552 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 493710) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 00:12:44.455000 139981954107200 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1030045) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 00:12:44.457000 140091496240960 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 866675) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:12:44.460000 140144279455552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_493638_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.467000 139981954107200 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1029976_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.470000 140091496240960 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_866606_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 00:12:44.486000 139763312441152 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1802723) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:12:44.495000 140144279455552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_493638_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.498000 140091496240960 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_866606_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.498000 139981954107200 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1029976_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.499000 139763312441152 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1802653_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.526000 140091496240960 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_866606_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -W0703 00:12:44.528000 139763312441152 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1802653_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ -W0703 00:12:44.528000 140144279455552 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_493638_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 866676) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866676 -[2]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 866677) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866677 -[3]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 866678) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866678 -[4]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 28 (local_rank: 4) - exitcode : -6 (pid: 866679) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866679 -[5]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 866680) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866680 -[6]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 866681) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866681 -[7]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 866682) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866682 
------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:44 - host : ip-26-0-165-24.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 866675) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 866675 -============================================================ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 493711) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493711 -[2]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 493712) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493712 -[3]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 493713) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493713 -[4]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 493714) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493714 -[5]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 493715) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493715 -[6]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 493716) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493716 -[7]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 493717) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493717 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:44 - host : ip-26-0-161-178.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 493710) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 493710 -============================================================ -W0703 00:12:44.535000 139981954107200 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1029976_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 1030046) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030046 -[2]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : -6 (pid: 1030047) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030047 -[3]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : -6 (pid: 1030048) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030048 -[4]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 1030049) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030049 -[5]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : -6 (pid: 1030050) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030050 -[6]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : -6 (pid: 1030051) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030051 -[7]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : -6 (pid: 1030052) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030052 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:44 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : -6 (pid: 1030045) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1030045 -============================================================ -W0703 00:12:44.555000 139763312441152 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1802653_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 1802724) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802724 -[2]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 1802725) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802725 -[3]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 1802726) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802726 -[4]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 1802727) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802727 -[5]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 1802728) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802728 -[6]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 1802729) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802729 -[7]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 1802730) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802730 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:44 - host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 1802723) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1802723 -============================================================ -E0703 00:12:44.565000 139765502732096 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3190039) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:12:44.577000 139765502732096 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3189970_0' has failed to shutdown the rendezvous 'none' due to an error of type 
RendezvousConnectionError. -W0703 00:12:44.604000 139765502732096 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3189970_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:44.629000 139765502732096 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3189970_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:44 - host : ip-26-0-163-226.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 3190040) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3190040 -[2]: - time : 2024-07-03_00:12:44 - host : ip-26-0-163-226.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 3190041) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3190041 -[3]: - time : 2024-07-03_00:12:44 - host : ip-26-0-163-226.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 3190043) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3190043 -[4]: - time : 2024-07-03_00:12:44 - host : ip-26-0-163-226.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 3190044) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3190044 -[5]: - time : 2024-07-03_00:12:44 - host : ip-26-0-163-226.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 3190045) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3190045 -[6]: - time : 2024-07-03_00:12:44 - host : ip-26-0-163-226.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 3190046) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3190046 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:44 - host : ip-26-0-163-226.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 3190039) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3190039 -============================================================ -E0703 00:12:45.164000 139702622873408 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 
1830359) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:12:45.177000 139702622873408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1830290_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:12:45.209000 139702622873408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1830290_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -W0703 00:12:45.228000 139702622873408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1830290_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:12:44 - host : ip-26-0-168-238.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 1830362) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1830362 -[2]: - time : 2024-07-03_00:12:44 - host : ip-26-0-168-238.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 1830364) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1830364 -[3]: - time : 2024-07-03_00:12:44 - host : ip-26-0-168-238.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 1830365) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1830365 -[4]: - time : 2024-07-03_00:12:44 - host : ip-26-0-168-238.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 1830366) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1830366 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:12:44 - host : ip-26-0-168-238.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 1830359) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1830359 -============================================================ -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited 
with exit code 1 -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-1/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/bench.slurm deleted file mode 100644 index 33aed588cf2c0a93d0b874f72d887430f0cb0d6c..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/config.yaml deleted file mode 100644 index f5a15d5c119a93581921f188ca5ca8fbb9b17e26..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 4 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 128 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out deleted file mode 100644 index 1b766c0c015a5e2b33f545d02c5bfbca61b1486b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/log.out +++ /dev/null @@ -1,5759 +0,0 @@ -======================== -START TIME: Wed Jul 3 00:12:51 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 00:12:56.920000 140445265241920 torch/distributed/run.py:757] -W0703 00:12:56.920000 140445265241920 torch/distributed/run.py:757] ***************************************** -W0703 00:12:56.920000 140445265241920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:12:56.920000 140445265241920 torch/distributed/run.py:757] ***************************************** -W0703 00:12:56.921000 139985663354688 torch/distributed/run.py:757] -W0703 00:12:56.921000 139985663354688 torch/distributed/run.py:757] ***************************************** -W0703 00:12:56.921000 139985663354688 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:12:56.921000 139985663354688 torch/distributed/run.py:757] ***************************************** -W0703 00:12:56.921000 139905265825600 torch/distributed/run.py:757] -W0703 00:12:56.921000 139905265825600 torch/distributed/run.py:757] ***************************************** -W0703 00:12:56.921000 139905265825600 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:12:56.921000 139905265825600 torch/distributed/run.py:757] ***************************************** -W0703 00:12:56.922000 140507061364544 torch/distributed/run.py:757] -W0703 00:12:56.922000 140507061364544 torch/distributed/run.py:757] ***************************************** -W0703 00:12:56.922000 140507061364544 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:12:56.922000 140507061364544 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.013000 140629515450176 torch/distributed/run.py:757] -W0703 00:12:57.013000 140629515450176 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.013000 140629515450176 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 00:12:57.013000 140629515450176 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.051000 139849737688896 torch/distributed/run.py:757] -W0703 00:12:57.051000 139849737688896 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.051000 139849737688896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:12:57.051000 139849737688896 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.259000 139716723873600 torch/distributed/run.py:757] -W0703 00:12:57.259000 139716723873600 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.259000 139716723873600 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:12:57.259000 139716723873600 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.264000 140671516706624 torch/distributed/run.py:757] -W0703 00:12:57.264000 140671516706624 torch/distributed/run.py:757] ***************************************** -W0703 00:12:57.264000 140671516706624 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:12:57.264000 140671516706624 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 00:13:23 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=2, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:13:23 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=128, -[default0]:07/03/2024 00:13:23 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=4, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128')), -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 00:13:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. 
Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. 
-[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: No checkpoint path provided. 
-[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 00:13:41 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=9|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=13|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=12|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=11|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=15|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=14|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=10|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=6|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=8|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=3|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=7|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=0|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=4|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=2|ip-26-0-172-57]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=5|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=1|PP=1|TP=1|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=1|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=2|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=4|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=5|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=3|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=7|ip-26-0-163-226]: No checkpoint path provided. 
-[default6]:07/03/2024 00:13:41 [INFO|DP=1|PP=0|TP=6|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 00:13:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 00:13:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 00:13:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 00:13:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 00:13:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 00:13:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 00:13:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 00:13:45 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 00:13:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 00:13:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 00:13:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 00:13:51.450590 | mbs: 128 | grad_accum: 4 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 00:13:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 00:13:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default1]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=15|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=3|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=2|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=4|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=7|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=3|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=4|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=5|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=13|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=12|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=14|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=10|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:52 [WARNING|DP=1|PP=1|TP=6|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=2|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=7|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:13:53 [WARNING|DP=0|PP=1|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:13:53 [WARNING|DP=0|PP=1|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:13:53 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:13:52 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:53 [WARNING|DP=1|PP=1|TP=0|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:13:52 [WARNING|DP=1|PP=0|TP=1|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:13:52 [WARNING|DP=0|PP=1|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:53 [WARNING|DP=0|PP=1|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:13:53 [WARNING|DP=1|PP=1|TP=9|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:07/03/2024 00:13:53 [WARNING|DP=1|PP=1|TP=11|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:53 [WARNING|DP=1|PP=1|TP=8|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:53 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:13:53 [WARNING|DP=1|PP=1|TP=1|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:53 [WARNING|DP=0|PP=1|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:13:53 [WARNING|DP=0|PP=1|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:53 [WARNING|DP=1|PP=0|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:13:53 [WARNING|DP=1|PP=1|TP=5|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:13:53 [WARNING|DP=1|PP=0|TP=6|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. 
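
The startup log above fixes the whole geometry of this run: dp=2, pp=2, tp=16 across 64 GPUs, micro_batch_size=128, batch_accumulation_per_replica=4, sequence_length=4096, ZeRO stage 1. The snippet below is a back-of-envelope sketch (not part of the bench_cluster code; the variable names are illustrative) that re-derives, from those echoed values only, the reported global batch size, the parameter totals, the ZeRO-1 optimizer split, and the 2.00 GiB allocation that the OutOfMemoryError tracebacks further down fail on:

    # Back-of-envelope check of the numbers reported in this log
    # (dp-2_tp-16_pp-2_mbz-128 run). Names here are illustrative,
    # not identifiers from the bench_cluster / nanotron code.
    GiB = 1024 ** 3
    MiB = 1024 ** 2
    BF16_BYTES = 2

    dp, pp, tp = 2, 2, 16
    mbs, grad_accum, seq_len = 128, 4, 4096
    hidden_size = 2048

    # 64-GPU topology and effective batch size
    assert dp * pp * tp == 64
    global_batch_size = dp * mbs * grad_accum          # 1024 samples/step, as logged
    tokens_per_step = global_batch_size * seq_len      # ~4.19M tokens/step

    # Per-TP-rank parameter counts as printed above (rounded in the log)
    params_pp0_rank, params_pp1_rank = 43.2e6, 32.7e6
    total_params = tp * (params_pp0_rank + params_pp1_rank)   # ~1.21e9
    total_param_mem = total_params * BF16_BYTES / MiB          # ~2316 MiB, matches 2315.81MiB

    # ZeRO-1 shards optimizer states across the dp=2 replicas
    optim_params_per_dp_rank = params_pp0_rank / dp            # ~21.6M (50.00%), as logged

    # The failing allocation in down_proj: its output is a full
    # [mbs, seq_len, hidden_size] bf16 tensor for one micro-batch.
    down_proj_out = mbs * seq_len * hidden_size * BF16_BYTES   # 2_147_483_648 B
    print(global_batch_size, tokens_per_step, f"{down_proj_out / GiB:.2f} GiB")

A single bf16 activation of shape [128, 4096, 2048] is already 2.00 GiB per micro-batch, so the ~77.5 GiB per-GPU footprint reported in the OOM messages below is presumably dominated by activations at mbz=128 rather than by the ~82 MiB of parameters or the sharded optimizer states, which would explain why this configuration falls over on the first training step.
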
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank31]: output = model(**micro_batch) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank31]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank31]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank31]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.82 GiB is free. Including non-PyTorch memory, this process has 77.50 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank23]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank23]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.77 GiB is free. Including non-PyTorch memory, this process has 77.54 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank28]: Traceback (most recent call last):
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank28]: trainer.train(dataloader)
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default4]:[rank28]: output = model(**micro_batch)
-[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank28]: return self._call_impl(*args, **kwargs)
-[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank28]: return forward_call(*args, **kwargs)
-[...] (the same torch/nn/modules/module.py _wrapped_call_impl / _call_impl frame pair repeats before each nn.Module call below and is omitted) [...]
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default4]:[rank28]: sharded_logits = self.model(
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default4]:[rank28]: output = self.pp_block(**new_kwargs)
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default4]:[rank28]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default4]:[rank28]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default4]:[rank28]: return row_linear(
-[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default4]:[rank28]: out = F.linear(input, weight, bias)
-[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.66 GiB is free. Including non-PyTorch memory, this process has 77.66 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[...] identical tracebacks from ranks 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 29 and 30 are interleaved here; every error line that is visible is the same torch.cuda.OutOfMemoryError raised from row_linear (functional.py, line 474) while allocating 2.00 GiB, with 1.66 GiB to 1.86 GiB free of 79.33 GiB per GPU and 68.05 GiB allocated by PyTorch in each process [...]
-[default2]:[rank18]: return
self._call_impl(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return 
forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank21]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: return 
self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank21]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return row_linear( -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return row_linear( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.77 GiB is free. 
Including non-PyTorch memory, this process has 77.54 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default2]:[rank18]: return row_linear( -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.77 GiB is free. Including non-PyTorch memory, this process has 77.54 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.86 GiB is free. Including non-PyTorch memory, this process has 77.46 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
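Every failing rank reports the same arithmetic: the matmul asks for 2.00 GiB while only 1.66 to 1.86 GiB of the 79.33 GiB device is still free, with 68.05 GiB already held by PyTorch tensors, so this benchmark configuration does not leave enough headroom for the allocation. The helper below is not part of the benchmark scripts; it is a minimal sketch (the name log_cuda_memory is made up) showing how the same quantities the error message reports can be printed with the standard torch.cuda APIs to watch headroom shrink before the failing step.

    import torch

    def log_cuda_memory(tag: str, device: int = 0) -> None:
        # Prints the same quantities the OutOfMemoryError message reports, in GiB.
        gib = 1024 ** 3
        free, total = torch.cuda.mem_get_info(device)    # driver-level free / total
        allocated = torch.cuda.memory_allocated(device)  # memory held by live PyTorch tensors
        reserved = torch.cuda.memory_reserved(device)    # memory held by the caching allocator
        print(f"[{tag}] total={total / gib:.2f} GiB, free={free / gib:.2f} GiB, "
              f"allocated={allocated / gib:.2f} GiB, "
              f"reserved-but-unallocated={(reserved - allocated) / gib:.2f} GiB")

Called just before the forward pass, a helper like this would be expected to show roughly 1.8 GiB free against the 2.00 GiB the next allocation needs, which is the failure in one line.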
(ranks 9 and 15 hit the identical traceback:)
-[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.86 GiB is free. Including non-PyTorch memory, this process has 77.46 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated.
-[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.86 GiB is free. Including non-PyTorch memory, this process has 77.46 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated.
(ranks 8 and 13 hit the identical traceback; rank 8's error line is cut short in the log:)
-[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU
-[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.86 GiB is free. Including non-PyTorch memory, this process has 77.46 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated.
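The error message suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True, but the logs above report only 79.45 MiB reserved-but-unallocated, so fragmentation is marginal here; the request simply exceeds the free memory, and a configuration that leaves more headroom per GPU (or a smaller activation footprint) would be needed. For reference, a minimal sketch of how the allocator option could be applied from a Python entry point; the setting has to be in place before the first CUDA allocation.

    import os

    # Must be set before the CUDA caching allocator is initialised,
    # i.e. before the first CUDA tensor is created.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch  # imported after setting the variable on purpose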
(ranks 10, 11, 12 and 14 hit the identical traceback; the OutOfMemoryError lines recorded here are:)
-[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.77 GiB is free. Including non-PyTorch memory, this process has 77.54 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated.
-[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.77 GiB is free. Including non-PyTorch memory, this process has 77.54 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated.
-[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.86 GiB is free. Including non-PyTorch memory, this process has 77.46 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank10]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank10]: return row_linear( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.77 GiB is free. Including non-PyTorch memory, this process has 77.54 GiB memory in use. Of the allocated memory 68.05 GiB is allocated by PyTorch, and 79.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -W0703 00:14:19.268000 139905265825600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3192000 closing signal SIGTERM -W0703 00:14:19.268000 139905265825600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3192003 closing signal SIGTERM -W0703 00:14:19.268000 139905265825600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3192004 closing signal SIGTERM -W0703 00:14:19.268000 139905265825600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3192005 closing signal SIGTERM -W0703 00:14:19.275000 140507061364544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 495736 closing signal SIGTERM -W0703 00:14:19.275000 140507061364544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 495737 closing signal SIGTERM -W0703 00:14:19.276000 140507061364544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 495739 closing signal SIGTERM -W0703 00:14:19.276000 140507061364544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 495740 closing signal SIGTERM -W0703 00:14:19.279000 140671516706624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 868707 closing signal SIGTERM -W0703 00:14:19.279000 140671516706624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 868708 closing signal SIGTERM -W0703 00:14:19.280000 140671516706624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 868709 closing signal SIGTERM -W0703 00:14:19.280000 140671516706624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 868711 closing signal SIGTERM -W0703 00:14:19.280000 140671516706624 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 868713 closing signal SIGTERM -E0703 00:14:20.398000 140507061364544 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 495735) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 00:14:20.404000 139905265825600 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3191999) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ 
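The torch.cuda.OutOfMemoryError tracebacks above are the first hard failures in this run: each rank asks for another 2.00 GiB inside the row-parallel F.linear of the MLP down_proj while only ~1.8 GiB of the 79.33 GiB card is still free, with 68.05 GiB already held by PyTorch. The allocator message itself points at PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True, but note its own caveat: that knob targets fragmentation ("reserved but unallocated" memory), and here only 79.45 MiB falls in that bucket, so it is at best a partial mitigation. A minimal sketch of how the hint could be applied on a rerun (the export line and its placement before the launcher are an assumption, not part of the original job script):

    # Sketch only: apply the allocator hint quoted in the OOM message before
    # the training processes are launched. This reduces fragmentation-related
    # failures; it does not create headroom if the configuration genuinely
    # needs more than ~79 GiB per GPU, in which case a smaller micro-batch or
    # a different parallelism split is the more likely fix.
    export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"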
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:14:19 - host : ip-26-0-161-178.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 495738) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:14:19 - host : ip-26-0-161-178.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 495741) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:14:19 - host : ip-26-0-161-178.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 495742) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:14:19 - host : ip-26-0-161-178.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 495735) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:14:19 - host : ip-26-0-163-226.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 3192001) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:14:19 - host : ip-26-0-163-226.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 3192002) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:14:19 - host : ip-26-0-163-226.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 3192006) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:14:19 - host : ip-26-0-163-226.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 3191999) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -E0703 00:14:20.806000 140671516706624 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 3 (pid: 868710) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:14:19 - host : ip-26-0-165-24.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 868712) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:14:19 - host : ip-26-0-165-24.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 868714) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:14:19 - host : ip-26-0-165-24.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 868710) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 4] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=5 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 4] ProcessGroupNCCL preparing to dump debug info. -[default4]:[rank4]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 4] [PG 2 Rank 4] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 5 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 1] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=13 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 1] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 3] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=13 -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 3] ProcessGroupNCCL preparing to dump debug info. -[default1]:[rank1]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 1] [PG 2 Rank 1] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 13 -[default3]:[rank3]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 3] [PG 2 Rank 3] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 13 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 7] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=12 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 7] ProcessGroupNCCL preparing to dump debug info. -[default7]:[rank7]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 7] [PG 2 Rank 7] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 12 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 6] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=13 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 6] ProcessGroupNCCL preparing to dump debug info. -[default6]:[rank6]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 6] [PG 2 Rank 6] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 13 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 2] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=12 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 2] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank2]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 2] [PG 2 Rank 2] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 12 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 0] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=5 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 0] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank0]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 0] [PG 2 Rank 0] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
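The [F] ProcessGroupNCCL watchdog aborts on ranks 0-7 in this part of the log look like a consequence of the crash rather than an independent bug: those ranks still had 5-13 enqueued collectives (workMetaList_.size()) that could never complete once their peers died, so after 600 seconds the heartbeat monitor terminated them. The message names the two knobs it honors; a hedged sketch of setting them for a rerun is below (the 1800 s value is purely illustrative, and, as the message itself says, these only help when the watchdog report is a false positive rather than a real hang):

    # Sketch only: relax or disable the ProcessGroupNCCL heartbeat monitor.
    # Relevant when ranks are merely blocked behind a crashed peer rather
    # than genuinely deadlocked in a NCCL/CUDA call.
    export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1800   # illustrative value
    # or, more drastically, turn the monitor off entirely:
    # export TORCH_NCCL_ENABLE_MONITORING=0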
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 5 -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 5] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=5 -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 5] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank5]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 5] [PG 2 Rank 5] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 5 -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and 
retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank46]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank46]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3a75bf9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: frame #1: + 0x5b3a23e (0x7f3aaf71623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f3aaf710c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f3aaf710f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f3aaf711fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3aaf6c6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3aaf6c6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3aaf6c6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f3aaf6c6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f3a76ed3189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f3a76eda610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f3a76ef9978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #12: + 0x5adc309 (0x7f3aaf6b8309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #13: + 0x5ae6f10 (0x7f3aaf6c2f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #14: + 0x5ae6fa5 (0x7f3aaf6c2fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #15: + 0x5124446 (0x7f3aaed00446 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #16: + 0x1acf4b8 (0x7f3aab6ab4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #17: + 0x5aee004 (0x7f3aaf6ca004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #18: + 0x5af36b5 (0x7f3aaf6cf6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #19: + 0xd2631e (0x7f3ac22b931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #20: + 0x47def4 (0x7f3ac1a10ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #21: + 0x1445a6 (0x55c44d6395a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55c44d632a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #23: + 0x150866 (0x55c44d645866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55c44d62e142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55c44d639a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #26: PyObject_Call + 0xbc (0x55c44d645f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55c44d62c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55c44d639a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55c44d62a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #30: + 0x150582 (0x55c44d645582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55c44d62a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #32: + 0x150582 (0x55c44d645582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55c44d62a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #34: + 0x150582 (0x55c44d645582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55c44d62a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55c44d631f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55c44d643c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #38: + 0x211239 (0x55c44d706239 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55c44d632a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55c44d62e3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55c44d639a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55c44d629c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55c44d639a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55c44d62a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #45: + 0x150582 (0x55c44d645582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #46: PyObject_Call + 0xbc (0x55c44d645f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55c44d62c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #48: + 0x150582 (0x55c44d645582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #49: PyObject_Call + 0xbc (0x55c44d645f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55c44d62c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55c44d639a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55c44d632007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55c44d643c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #54: + 0x211239 (0x55c44d706239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #55: PyObject_Call + 0x207 (0x55c44d646067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55c44d62c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #57: + 0x150582 (0x55c44d645582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55c44d62a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #59: + 0x150582 (0x55c44d645582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #60: PyObject_Call + 0xbc (0x55c44d645f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55c44d62c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #62: + 0x150582 (0x55c44d645582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #63: PyObject_Call + 0xbc (0x55c44d645f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: Traceback (most recent call last): -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = model(**micro_batch) -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File 
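The torch.distributed.DistBackendError above (store->get('0:1') failing with "Connection reset by peer" while rank 46 sets up a NCCL communicator inside _recv_meta) and the interleaved rank 52-55 tracebacks around this point are downstream symptoms: these pipeline-parallel receivers lose the c10d key-value store once the crashed process goes away, exactly as the trailing hint ("possible application crash on rank 0 or a network set up issue") suggests. When triaging a log like this, only the earliest error each rank printed is informative; a purely illustrative shell sketch for extracting that from a merged job log follows (the log.out filename is an assumption):

    # Sketch only: print the first "*Error" line each rank reported, in file
    # order, so cascading DistBackendError noise does not bury the root cause.
    grep -oE '\[rank[0-9]+\]: [A-Za-z0-9_.]+Error.*' log.out \
      | awk -F': ' '!seen[$1]++'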
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default5]:[rank53]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: return forward_call(*args, **kwargs) 
-[ranks 52-55 fail with the same traceback; their output is interleaved in the original log, so the continuation is shown once, for rank 52 — ranks 53, 54 and 55 are identical]
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]:     return self._call_impl(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]:     return forward_call(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default4]:[rank52]:     sharded_logits = self.model(
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]:     return self._call_impl(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]:     return forward_call(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default4]:[rank52]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank52]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]:     return self._call_impl(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]:     return forward_call(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default4]:[rank52]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default4]:[rank52]:     pipeline_state.run_communication()
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank52]:     recv_activation_tensor = recv_activation()
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank52]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank52]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank52]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default4]:[rank52]:     dist.recv(
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank52]:     return func(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank52]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank52]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default4]:[rank52]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[frames #0-#63, repeated per rank: c10::Error::Error, c10d::TCPStore::doWait / doGet / get, c10d::PrefixStore::get, c10d::ProcessGroupNCCL::broadcastUniqueNCCLID / getNCCLComm / recv, then CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call) in libc10.so, libtorch_cpu.so, libtorch_cuda.so, libtorch_python.so and python3.10]
-[default4]:[rank52]: This may indicate a possible application crash on rank 0 or a network set up issue.
-[ranks 53, 54 and 55 report the same DistBackendError, frame trace and closing note]
-[default0]:[rank56]: Traceback (most recent call last):
-[default0]:[rank56]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank56]:     trainer.train(dataloader)
-[default0]:[rank56]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank56]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank56]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank56]:     outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank56]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank56]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank56]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank56]:     output = model(**micro_batch)
-[rank 56 then fails in the same llama.py forward / block.py recv_from_pipeline_state_buffer / p2p.py _recv_meta / dist.recv chain shown above for rank 52]
-[default0]:[rank56]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default0]:[rank56]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default0]:[rank56]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfd6744897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:[rank56]: frame #1: + 0x5b3a23e (0x7fe01026123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fe01025bc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fe01025bf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fe01025cfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default0]:[rank56]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe010211371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[ranks 57 and 38 begin emitting the same traceback and error, interleaved with rank 56's output, before this hunk of the log is cut off]
-[default6]:[rank38]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank38]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank38]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7fb409b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10
-[default0]:[rank56]: frame #6: c10d::PrefixStore::get(std::string
const&) + 0x31 (0x7fe010211371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe010211371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -/site-packages/torch/lib/libc10.so) -[default6]:[rank38]: frame #1: + 0x5b3a23e (0x7f7fedbb823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f7fedbb2c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f7fedbb2f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f7fedbb3fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fedb68371 in /fsx/ferdinandmom[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fedb68371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fedb68371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f7fedb68371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f7fb5375189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe010211371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f7fb537c610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f7fb539b978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: frame #12: + 0x5adc309 (0x7f7fedb5a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #13: + 0x5ae6f10 (0x7f7fedb64f10 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: frame #14: + 0x5ae6fa5 (0x7f7fedb64fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: frame #15: + 0x5124446 (0x7f7fed1a2446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: frame #16: + 0x1acf4b8 (0x7f7fe9b4d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #17: + 0x5aee004 (0x7f7fedb6c004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #18: + 0x5af36b5 (0x7f7fedb716b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: frame #19: + 0xd2631e (0x7f800075b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank38]: frame #20: + 0x47def4 (0x7f7fffeb2ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank57]: dist.recv( -[default6]:[rank38]: frame #21: + 0x1445a6 (0x556897db35a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #22: _PyObject_MakeTpCall + 0x26b (0x556897daca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #23: + 0x150866 (0x556897dbf866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x556897da8142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #25: _PyFunction_Vectorcall + 0x6c (0x556897db3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #26: PyObject_Call + 0xbc (0x556897dbff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x556897da62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster[default0]:[rank56]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fdfd7a1e189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fdfd7a25610 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fdfd7a44978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -/bin/python3.10) -[default6]:[rank38]: frame #28: _PyFunction_Vectorcall + 0x6c (0x556897db3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x556897da48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #30: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x556897da48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #32: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x556897da48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: return func(*args, **kwargs) -[default6]:[rank38]: frame #34: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x556897da48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x556897dabf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #37: _PyObject_Call_Prepend + 0x69 (0x556897dbdc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: frame #12: + 0x5adc309 (0x7fe010203309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #38: + 0x211239 (0x556897e80239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #39: _PyObject_MakeTpCall + 0x26b (0x556897daca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #13: + 0x5ae6f10 (0x7fe01020df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x556897da83e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #41: _PyFunction_Vectorcall + 0x6c (0x556897db3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x556897da3c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #43: _PyFunction_Vectorcall + 0x6c (0x556897db3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x556897da48fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: frame #45: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #46: PyObject_Call + 0xbc (0x556897dbff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #14: + 0x5ae6fa5 (0x7fe01020dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x556897da62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #48: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank38]: frame #49: PyObject_Call + 0xbc (0x556897dbff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x556897da62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank38]: frame #51: _PyFunction_Vectorcall + 0x6c (0x556897db3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x556897dac007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #53: _PyObject_Call_Prepend + 0x69 (0x556897dbdc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0255c22897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank38]: frame #54: + 0x211239 (0x556897e80239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #55: PyObject_Call + 0x207 (0x556897dc0067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #1: + 0x5b3a23e (0x7f028f73f23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x556897da62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #57: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f028f739c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x556897da48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #59: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #60: PyObject_Call + 0xbc (0x556897dbff1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #15: + 0x5124446 (0x7fe00f84b446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #16: + 0x1acf4b8 (0x7fe00c1f64b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #17: + 0x5aee004 (0x7fe010215004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x556897da62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #62: + 0x150582 (0x556897dbf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #18: + 0x5af36b5 (0x7fe01021a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #63: PyObject_Call + 0xbc (0x556897dbff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default0]:[rank56]: frame #19: + 0xd2631e (0x7fe022e0431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #20: + 0x47def4 (0x7fe02255bef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #21: + 0x1445a6 (0x55e95b6555a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f028f739f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f028f73afd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f028f6ef371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f028f6ef371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f028f6ef371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f028f6ef371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f0256efc189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f0256f03610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) 
-[default1]:[rank57]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f0256f22978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: frame #12: + 0x5adc309 (0x7f028f6e1309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #13: + 0x5ae6f10 (0x7f028f6ebf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55e95b64ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #23: + 0x150866 (0x55e95b661866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55e95b64a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55e95b655a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #26: PyObject_Call + 0xbc (0x55e95b661f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #14: + 0x5ae6fa5 (0x7f028f6ebfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #15: + 0x5124446 (0x7f028ed29446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #16: + 0x1acf4b8 (0x7f028b6d44b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55e95b6482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55e95b655a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55e95b6468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #17: + 0x5aee004 (0x7f028f6f3004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #30: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55e95b6468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #32: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55e95b6468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #34: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55e95b6468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55e95b64df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #18: + 0x5af36b5 (0x7f028f6f86b5 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #19: + 0xd2631e (0x7f02a22e231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank57]: frame #20: + 0x47def4 (0x7f02a1a39ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55e95b65fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #21: + 0x1445a6 (0x562c75f825a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #22: _PyObject_MakeTpCall + 0x26b (0x562c75f7ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #23: + 0x150866 (0x562c75f8e866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x562c75f77142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #25: _PyFunction_Vectorcall + 0x6c (0x562c75f82a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #26: PyObject_Call + 0xbc (0x562c75f8ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x562c75f752b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #28: _PyFunction_Vectorcall + 0x6c (0x562c75f82a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x562c75f738fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #38: + 0x211239 (0x55e95b722239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #30: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x562c75f738fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55e95b64ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55e95b64a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55e95b655a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55e95b645c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55e95b655a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55e95b6468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #45: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #46: PyObject_Call + 0xbc (0x55e95b661f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #47: 
_PyEval_EvalFrameDefault + 0x2d83 (0x55e95b6482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #48: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #49: PyObject_Call + 0xbc (0x55e95b661f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55e95b6482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #32: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x562c75f738fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #34: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x562c75f738fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x562c75f7af50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #37: _PyObject_Call_Prepend + 0x69 (0x562c75f8cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55e95b655a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #38: + 0x211239 (0x562c7604f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #39: _PyObject_MakeTpCall + 0x26b (0x562c75f7ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x562c75f773e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55e95b64e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55e95b65fc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #41: _PyFunction_Vectorcall + 0x6c (0x562c75f82a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x562c75f72c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #54: + 0x211239 (0x55e95b722239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #43: _PyFunction_Vectorcall + 0x6c (0x562c75f82a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x562c75f738fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #45: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #46: PyObject_Call + 0xbc (0x562c75f8ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #55: PyObject_Call + 0x207 (0x55e95b662067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x562c75f752b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #48: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55e95b6482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #49: PyObject_Call + 0xbc (0x562c75f8ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #57: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55e95b6468fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #59: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x562c75f752b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #60: PyObject_Call + 0xbc (0x55e95b661f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55e95b6482b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #62: + 0x150582 (0x55e95b661582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #51: _PyFunction_Vectorcall + 0x6c (0x562c75f82a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #63: PyObject_Call + 0xbc (0x55e95b661f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank57]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x562c75f7b007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #53: _PyObject_Call_Prepend + 0x69 (0x562c75f8cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #54: + 0x211239 (0x562c7604f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #55: PyObject_Call + 0x207 (0x562c75f8f067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x562c75f752b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #57: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x562c75f738fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #59: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #60: PyObject_Call + 0xbc (0x562c75f8ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x562c75f752b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #62: + 0x150582 (0x562c75f8e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #63: PyObject_Call + 0xbc (0x562c75f8ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
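All of the failing ranks abort at the same point: the first dist.recv of the pipeline-parallel metadata message. That first point-to-point call is where ProcessGroupNCCL lazily creates the communicator for the sender/receiver pair (the broadcastUniqueNCCLID / getNCCLComm frames above), and the ncclUniqueId exchange through the c10d TCPStore fails with "Connection reset by peer", which is consistent with the store-hosting process having already gone down. The sketch below is a hypothetical illustration of that receive pattern, not nanotron's actual P2P._recv_meta; the metadata layout, dtype and device are assumptions.

```python
# Hypothetical sketch of the receive pattern seen in the tracebacks above.
# Assumptions: an initialized NCCL process group, CUDA tensors, and an
# 8-element int64 metadata message ("ndim followed by dims"). This is NOT
# nanotron's P2P._recv_meta; it only shows where the lazy NCCL communicator
# setup (and hence the TCPStore lookup) happens.
import torch
import torch.distributed as dist

def recv_activation_sketch(from_rank: int, tag: int = 0) -> torch.Tensor:
    # First p2p call between this pair of ranks: ProcessGroupNCCL creates the
    # communicator here, exchanging ncclUniqueId through the c10d TCPStore.
    # If the peer or the store host has already crashed, this recv raises the
    # DistBackendError ("Connection reset by peer") seen in the log.
    meta = torch.zeros(8, dtype=torch.int64, device="cuda")
    dist.recv(meta, src=from_rank, tag=tag)

    # Decode the assumed metadata layout, then receive the activation tensor
    # itself into a freshly allocated buffer.
    ndim = int(meta[0].item())
    shape = [int(d) for d in meta[1 : 1 + ndim].tolist()]
    buffer = torch.empty(shape, dtype=torch.bfloat16, device="cuda")  # assumed dtype
    dist.recv(buffer, src=from_rank, tag=tag)
    return buffer
```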
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank39]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank39]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd3a99fc897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank39]: frame #1: + 0x5b3a23e (0x7fd3e351923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd3e3513c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd3e3513f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd3e3514fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd3e34c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd3e34c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd3e34c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd3e34c9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd3aacd6189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd3aacdd610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd3aacfc978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #12: + 0x5adc309 (0x7fd3e34bb309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #13: + 0x5ae6f10 (0x7fd3e34c5f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #14: + 0x5ae6fa5 (0x7fd3e34c5fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #15: + 0x5124446 (0x7fd3e2b03446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #16: + 0x1acf4b8 (0x7fd3df4ae4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #17: + 0x5aee004 (0x7fd3e34cd004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #18: + 0x5af36b5 (0x7fd3e34d26b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #19: + 0xd2631e (0x7fd3f60bc31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #20: + 0x47def4 (0x7fd3f5813ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #21: + 0x1445a6 (0x559aee9cd5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #22: _PyObject_MakeTpCall + 0x26b (0x559aee9c6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #23: + 0x150866 (0x559aee9d9866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x559aee9c2142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #25: _PyFunction_Vectorcall + 0x6c (0x559aee9cda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #26: PyObject_Call + 0xbc (0x559aee9d9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank39]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x559aee9c02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #28: _PyFunction_Vectorcall + 0x6c (0x559aee9cda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x559aee9be8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #30: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x559aee9be8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #32: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x559aee9be8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #34: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x559aee9be8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x559aee9c5f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #37: _PyObject_Call_Prepend + 0x69 (0x559aee9d7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #38: + 0x211239 (0x559aeea9a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #39: _PyObject_MakeTpCall + 0x26b (0x559aee9c6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x559aee9c23e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #41: _PyFunction_Vectorcall + 0x6c (0x559aee9cda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x559aee9bdc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #43: _PyFunction_Vectorcall + 0x6c (0x559aee9cda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x559aee9be8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #45: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #46: PyObject_Call + 0xbc (0x559aee9d9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x559aee9c02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #48: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #49: PyObject_Call + 0xbc (0x559aee9d9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x559aee9c02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x559aee9cda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x559aee9c6007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #53: _PyObject_Call_Prepend + 0x69 (0x559aee9d7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #54: + 0x211239 (0x559aeea9a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #55: PyObject_Call + 0x207 (0x559aee9da067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x559aee9c02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #57: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x559aee9be8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #59: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #60: PyObject_Call + 0xbc (0x559aee9d9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x559aee9c02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #62: + 0x150582 (0x559aee9d9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #63: PyObject_Call + 0xbc (0x559aee9d9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
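Every rank writes to the same log.out through torchrun's tee, so lines from different ranks interleave and occasionally collide mid-line, which is what makes the tracebacks in this log hard to read. A small throwaway helper along the following lines (not part of the bench_cluster tooling, and only able to regroup whole lines; mid-line collisions still need manual untangling) can split such a combined log back into one block per rank:

```python
# Throwaway helper (not part of bench_cluster): regroup a combined torchrun
# log into one contiguous block per rank, keyed on the "[defaultN]:[rankM]:"
# prefix that torchrun prepends to each tee'd line.
import re
import sys
from collections import defaultdict

PREFIX = re.compile(r"^\[default(\d+)\]:\[rank(\d+)\]:\s?(.*)$")

def split_by_rank(lines):
    per_rank = defaultdict(list)
    for line in lines:
        m = PREFIX.match(line)
        if m:
            per_rank[int(m.group(2))].append(m.group(3))
        else:
            # Lines without a rank prefix (launcher output, etc.)
            per_rank[-1].append(line.rstrip("\n"))
    return per_rank

if __name__ == "__main__":
    # Usage: python split_log.py log.out
    with open(sys.argv[1]) as f:
        for rank, lines in sorted(split_by_rank(f).items()):
            print(f"===== rank {rank} =====")
            print("\n".join(lines))
```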
[ranks 33 ([default1]), 36 ([default4]) and 49 ([default1]) abort with the same exception; their log lines are interleaved. The rank 49 copy, de-interleaved:]
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank49]:     trainer.train(dataloader)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank49]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank49]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank49]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank49]:     output = model(**micro_batch)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank49]:     sharded_logits = self.model(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank49]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank49]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank49]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank49]:     pipeline_state.run_communication()
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank49]:     recv_activation_tensor = recv_activation()
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank49]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank49]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank49]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank49]:     dist.recv(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank49]:     return func(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank49]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank49]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank49]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank49]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc9342f3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default1]:[rank49]: frame #1: + 0x5b3a23e (0x7fc96de1023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fc96de0ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fc96de0af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fc96de0bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc96ddc0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc96ddc0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc96ddc0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc96ddc0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default1]:[rank49]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fc9355cd189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:[rank49]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fc9355d4610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default1]:[rank49]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fc9355f3978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
[frames #12-#20: anonymous offsets in libtorch_cpu.so and libtorch_python.so; frames #21-#63: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend) in bin/python3.10]
-[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue.
[ranks 33 and 36 print the identical traceback, DistBackendError and frame dump, ending with the same note; only the process-local addresses differ.]
-[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank48]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank48]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f13d9806897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank48]: frame #1: + 0x5b3a23e (0x7f141332323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f141331dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f141331df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f141331efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f14132d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f14132d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f14132d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f14132d3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f13daae0189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f13daae7610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f13dab06978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #12: + 0x5adc309 (0x7f14132c5309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #13: + 0x5ae6f10 (0x7f14132cff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #14: + 0x5ae6fa5 (0x7f14132cffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #15: + 0x5124446 (0x7f141290d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #16: + 0x1acf4b8 (0x7f140f2b84b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #17: + 0x5aee004 (0x7f14132d7004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #18: + 0x5af36b5 (0x7f14132dc6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #19: + 0xd2631e (0x7f1425ec631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #20: + 0x47def4 (0x7f142561def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #21: + 0x1445a6 (0x563e6676d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563e66766a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #23: + 0x150866 (0x563e66779866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563e66762142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563e6676da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #26: PyObject_Call + 0xbc (0x563e66779f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank48]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563e667602b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563e6676da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563e6675e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #30: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563e6675e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #32: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563e6675e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #34: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563e6675e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563e66765f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563e66777c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #38: + 0x211239 (0x563e6683a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563e66766a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563e667623e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563e6676da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563e6675dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563e6676da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563e6675e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #45: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #46: PyObject_Call + 0xbc (0x563e66779f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563e667602b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #48: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #49: PyObject_Call + 0xbc (0x563e66779f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563e667602b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x563e6676da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563e66766007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563e66777c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #54: + 0x211239 (0x563e6683a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #55: PyObject_Call + 0x207 (0x563e6677a067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563e667602b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #57: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563e6675e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #59: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #60: PyObject_Call + 0xbc (0x563e66779f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563e667602b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #62: + 0x150582 (0x563e66779582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #63: PyObject_Call + 0xbc (0x563e66779f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
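The rank-48 stack above bottoms out in the NCCL communicator bootstrap (frames #8-#11: PrefixStore::get -> broadcastUniqueNCCLID -> getNCCLComm -> ProcessGroupNCCL::recv): the receiving rank blocks on the c10d key-value store waiting for the ncclUniqueId its peer is supposed to publish under a key like '0:1', and "Connection reset by peer" typically means the process hosting that TCPStore has already gone away (or, as the closing message says, there is a network setup issue). The tracebacks that follow for ranks 51, 50, 32, 34 and 35 all reach the same point from nanotron's pipeline-parallel receive path (_recv_meta -> dist.recv). A minimal sketch of that store hand-off, with illustrative host/port/key values that are not taken from the log and no claim to match the torch internals:

    # Hedged sketch of the c10d TCPStore rendezvous the traces are failing in.
    from datetime import timedelta
    import torch.distributed as dist

    HOST, PORT = "127.0.0.1", 29555          # illustrative only

    def rendezvous(rank: int, world_size: int) -> bytes:
        # Rank 0 hosts the store; the other ranks connect as clients.
        store = dist.TCPStore(HOST, PORT, world_size,
                              is_master=(rank == 0),
                              timeout=timedelta(seconds=120))
        if rank == 0:
            # Stand-in for the serialized ncclUniqueId that ProcessGroupNCCL
            # publishes under a key such as '0:1'.
            store.set("0:1", "nccl-unique-id-placeholder")
        # Peers block here; if the store host has already crashed, this get()
        # surfaces as the DistBackendError / "Connection reset by peer" above.
        return store.get("0:1")

Because the NCCL point-to-point communicator is created lazily on first use, every stage that was blocked on a receive hits this same store error as soon as one rank in the pipeline dies, which is why the identical DistBackendError repeats for each rank below.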
-[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: Traceback (most recent call last): -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank50]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f66cd479897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default2]:[rank50]: frame #1: + 0x5b3a23e (0x7f6706f9623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f6706f90c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f6706f90f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f6706f91fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6706f46371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6706f46371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6706f46371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank50]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6706f46371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f66ce753189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f66ce75a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f66ce779978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: dist.recv( -[default2]:[rank50]: frame #12: + 0x5adc309 (0x7f6706f38309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: frame #13: + 0x5ae6f10 (0x7f6706f42f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank51]: Exception 
raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank51]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5cd061e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank51]: frame #1: + 0x5b3a23e (0x7f5d0a13b23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5d0a135c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5d0a135f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5d0a136fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #14: + 0x5ae6fa5 (0x7f6706f42fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5d0a0eb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #15: + 0x5124446 (0x7f6706580446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #16: + 0x1acf4b8 (0x7f6702f2b4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #17: + 0x5aee004 (0x7f6706f4a004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #18: + 0x5af36b5 (0x7f6706f4f6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #19: + 0xd2631e (0x7f6719b3931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #20: + 0x47def4 (0x7f6719290ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #21: + 0x1445a6 (0x55ba1a0045a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55ba19ffda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #23: + 0x150866 (0x55ba1a010866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55ba19ff9142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5d0a0eb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55ba1a004a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #26: PyObject_Call + 0xbc (0x55ba1a010f1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5d0a0eb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f5d0a0eb371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f5cd18f8189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55ba19ff72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55ba1a004a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55ba19ff58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f5cd18ff610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #30: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55ba19ff58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #32: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55ba19ff58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f5cd191e978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #34: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55ba19ff58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #12: + 0x5adc309 (0x7f5d0a0dd309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55ba19ffcf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #13: + 0x5ae6f10 (0x7f5d0a0e7f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55ba1a00ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #38: + 0x211239 (0x55ba1a0d1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55ba19ffda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 
(0x55ba19ff93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55ba1a004a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #14: + 0x5ae6fa5 (0x7f5d0a0e7fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #15: + 0x5124446 (0x7f5d09725446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #16: + 0x1acf4b8 (0x7f5d060d04b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #17: + 0x5aee004 (0x7f5d0a0ef004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default2]:[rank50]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55ba19ff4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55ba1a004a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55ba19ff58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #45: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #46: PyObject_Call + 0xbc (0x55ba1a010f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55ba19ff72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #18: + 0x5af36b5 (0x7f5d0a0f46b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, 
**kwargs) -[default3]:[rank51]: frame #19: + 0xd2631e (0x7f5d1ccde31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: frame #20: + 0x47def4 (0x7f5d1c435ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in[default3]:[rank51]: frame #21: + 0x1445a6 (0x560c1bcbd5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #22: _PyObject_MakeTpCall + 0x26b (0x560c1bcb6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) - forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: pipe[default3]:[rank51]: frame #23: + 0x150866 (0x560c1bcc9866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -line_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, 
tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: frame #48: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #49: PyObject_Call + 0xbc (0x55ba1a010f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55ba19ff72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank32]: Exception raised from recvBytes at ../torch/csr[default2]:[rank50]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55ba1a004a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55ba19ffd007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -c/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank32]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41069a4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank32]: frame #1: + 0x5b3a23e (0x7f41404c123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f41404bbc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f41404bbf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f41404[default3]:[rank51]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x560c1bcb2142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #25: _PyFunction_Vectorcall + 0x6c (0x560c1bcbda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -bcfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4140471371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #6: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4140471371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4140471371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4140471371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLI[default3]:[rank51]: frame #26: PyObject_Call + 0xbc (0x560c1bcc9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x560c1bcb02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -D(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f4107c7e189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4107c85610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: frame #28: _PyFunction_Vectorcall + 0x6c (0x560c1bcbda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4107ca4978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #12: + 0x5adc309 (0x7f4140463309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #13: + 0x5ae6f10 (0x7f414046df10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55ba1a00ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #54: + 0x211239 (0x55ba1a0d1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x560c1bcae8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #55: PyObject_Call + 0x207 (0x55ba1a011067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #14: + 0x5ae6fa5 (0x7f414046dfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #30: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #15: + 0x5124446 (0x7f413faab446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #16: + 0x1acf4b8 (0x7f413c4564b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #17: + 0x5aee004 (0x7f4140475004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame 
#18: + 0x5af36b5 (0x7f414047a6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55ba19ff72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #19: + 0xd2631e (0x7f415306431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #20: + 0x47def4 (0x7f41527bbef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #21: + 0x1445a6 (0x557d896c35a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x560c1bcae8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #22: _PyObject_MakeTpCall + 0x26b (0x557d896bca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #23: + 0x150866 (0x557d896cf866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x557d896b8142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #25: _PyFunction_Vectorcall + 0x6c (0x557d896c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #26: PyObject_Call + 0xbc (0x557d896cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x557d896b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #28: _PyFunction_Vectorcall + 0x6c (0x557d896c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x557d896b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #30: + 0x150582 (0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x557d896b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #32: + 0x150582 (0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x557d896b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #34: + 0x150582 (0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x557d896b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x557d896bbf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #37: _PyObject_Call_Prepend + 0x69 (0x557d896cdc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #38: + 0x211239 (0x557d89790239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #39: _PyObject_MakeTpCall + 0x26b (0x557d896bca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank32]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x557d896b83e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557d896c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557d896b3c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557d896c3a2c in /fsx/f[default3]:[rank51]: frame #32: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -erdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x557d896b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x560c1bcae8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #34: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #45: + 0x150582 (0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #46: PyObject_Call + 0xbc (0x557d896cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x557d896b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #48: + 0x150582 (0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #49: PyObject_Call + 0xbc (0x557d896cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x557d896b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557d896c3a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/pyt[default2]:[rank50]: frame #57: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55ba19ff58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -hon3.10) -[default3]:[rank51]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x560c1bcae8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x560c1bcb5f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557d896bc007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557d896cdc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #54: + 0x211239 (0x557d89790239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #55: PyObject_Call + 0x207 (0x557d896d0067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x557d896b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #57: + 0x150582 
(0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #37: _PyObject_Call_Prepend + 0x69 (0x560c1bcc7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #38: + 0x211239 (0x560c1bd8a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x557d896b48fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #59: + 0x150582 (0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #60: PyObject_Call + 0xbc (0x557d896cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x557d896b62b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #39: _PyObject_MakeTpCall + 0x26b (0x560c1bcb6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #62: + 0x150582 (0x557d896cf582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #63: PyObject_Call + 0xbc (0x557d896cff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank50]: frame #59: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default3]:[rank51]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x560c1bcb23e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: frame #60: PyObject_Call + 0xbc (0x55ba1a010f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: frame #41: _PyFunction_Vectorcall + 0x6c (0x560c1bcbda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", 
line 1541, in _call_impl -[default2]:[rank50]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55ba19ff72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/benc[default3]:[rank51]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x560c1bcadc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -h_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: frame #62: + 0x150582 (0x55ba1a010582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: frame #43: _PyFunction_Vectorcall + 0x6c (0x560c1bcbda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: frame #63: PyObject_Call + 0xbc (0x55ba1a010f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: buffers, futures = 
self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank34]: dist.recv( -[default3]:[rank51]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x560c1bcae8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank34]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc91c129897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank34]: frame #1: + 0x5b3a23e (0x7fc955c4623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fc955c40c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fc955c40f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fc955c41fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc955bf6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/l[default3]:[rank51]: frame #45: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #46: PyObject_Call + 0xbc (0x560c1bcc9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x560c1bcb02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc955bf6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc955bf6371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc955bf6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fc91d403189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&,[default3]:[rank51]: frame #48: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #49: PyObject_Call + 0xbc (0x560c1bcc9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x560c1bcb02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) - c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fc91d40a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fc91d429978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #12: + 0x5adc309 (0x7fc955be8309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #13: + 0x5ae6f10 (0x7fc955bf2f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #51: _PyFunction_Vectorcall + 0x6c (0x560c1bcbda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x560c1bcb6007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #53: _PyObject_Call_Prepend + 0x69 (0x560c1bcc7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #14: + 0x5ae6fa5 (0x7fc955bf2fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #15: + 0x5124446 (0x7fc955230446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #16: + 0x1acf4b8 (0x7fc951bdb4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #17: + 0x5aee004 (0x7fc955bfa004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #54: + 0x211239 (0x560c1bd8a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #55: PyObject_Call + 0x207 (0x560c1bcca067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x560c1bcb02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #57: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default2]:[rank34]: frame #18: + 0x5af36b5 (0x7fc955bff6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #19: + 0xd2631e (0x7fc9687e931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #20: + 0x47def4 (0x7fc967f40ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #21: + 0x1445a6 (0x558a9b2b65a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x560c1bcae8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #59: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #60: PyObject_Call + 0xbc (0x560c1bcc9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x560c1bcb02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #62: + 0x150582 (0x560c1bcc9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #63: PyObject_Call + 0xbc (0x560c1bcc9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #22: _PyObject_MakeTpCall + 0x26b (0x558a9b2afa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #23: + 0x150866 (0x558a9b2c2866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x558a9b2ab142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #25: _PyFunction_Vectorcall + 0x6c (0x558a9b2b6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #26: PyObject_Call + 0xbc (0x558a9b2c2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default2]:[rank34]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x558a9b2a92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #28: _PyFunction_Vectorcall + 0x6c (0x558a9b2b6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x558a9b2a78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #30: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x558a9b2a78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #32: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x558a9b2a78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #34: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x558a9b2a78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x558a9b2aef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #37: _PyObject_Call_Prepend + 0x69 (0x558a9b2c0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #38: + 0x211239 (0x558a9b383239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #39: _PyObject_MakeTpCall + 0x26b (0x558a9b2afa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x558a9b2ab3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #41: _PyFunction_Vectorcall + 0x6c (0x558a9b2b6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x558a9b2a6c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #43: _PyFunction_Vectorcall + 0x6c (0x558a9b2b6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x558a9b2a78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #45: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #46: PyObject_Call + 0xbc (0x558a9b2c2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x558a9b2a92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #48: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #49: PyObject_Call + 0xbc (0x558a9b2c2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: frame 
#50: _PyEval_EvalFrameDefault + 0x2d83 (0x558a9b2a92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: trainer.train(dataloader) -[default2]:[rank34]: frame #51: _PyFunction_Vectorcall + 0x6c (0x558a9b2b6a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x558a9b2af007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: frame #53: _PyObject_Call_Prepend + 0x69 (0x558a9b2c0c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: frame #54: + 0x211239 (0x558a9b383239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: frame #55: PyObject_Call + 0x207 (0x558a9b2c3067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x558a9b2a92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: frame #57: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: output = model(**micro_batch) -[default2]:[rank34]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x558a9b2a78fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: frame #59: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: frame #60: PyObject_Call + 0xbc (0x558a9b2c2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x558a9b2a92b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #62: + 0x150582 (0x558a9b2c2582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #63: PyObject_Call + 0xbc (0x558a9b2c2f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default2]:[rank34]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
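Read bottom-up, every rank trace in this log fails at the same point: nanotron's pipeline-parallel P2P layer calls dist.recv() to fetch tensor metadata, the first recv lazily sets up the NCCL communicator, and the ncclUniqueId exchange through the c10d TCPStore dies with "Connection reset by peer", meaning the store (hosted next to rank 0) is already gone. The following is a minimal sketch of that call pattern, outside nanotron, under the assumption of a single-host 2-GPU torchrun launch; the file name and tensor shape are illustrative only.

# p2p_repro.py (illustrative sketch, not nanotron code)
# Launch with: torchrun --nproc_per_node 2 p2p_repro.py
import torch
import torch.distributed as dist

def main():
    # torchrun exports RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT; init uses the c10d TCPStore.
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank)
    payload = torch.zeros(8, device="cuda")
    if rank == 0:
        payload += 1.0
        dist.send(payload, dst=1)  # first send/recv pair lazily creates the NCCL communicator
    else:
        # This is the call the traces above are stuck in: before any bytes move, the ranks
        # exchange an ncclUniqueId through the TCPStore. If the process that owns the store
        # has already died, store->get() fails with "Connection reset by peer".
        dist.recv(payload, src=0)
    dist.destroy_process_group()

if __name__ == "__main__":
    main()

In other words, when rank 0 crashes first (for example out of memory), every other rank surfaces this same secondary DistBackendError rather than the root cause, which is what the trailing "possible application crash on rank 0" hint refers to.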
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank35]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank35]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd0d3267897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank35]: frame #1: + 0x5b3a23e (0x7fd10cd8423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd10cd7ec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd10cd7ef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd10cd7ffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd10cd34371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd10cd34371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd10cd34371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd10cd34371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd0d4541189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd0d4548610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd0d4567978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #12: + 0x5adc309 (0x7fd10cd26309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #13: + 0x5ae6f10 (0x7fd10cd30f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #14: + 
0x5ae6fa5 (0x7fd10cd30fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #15: + 0x5124446 (0x7fd10c36e446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #16: + 0x1acf4b8 (0x7fd108d194b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #17: + 0x5aee004 (0x7fd10cd38004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #18: + 0x5af36b5 (0x7fd10cd3d6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #19: + 0xd2631e (0x7fd11f92731e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: frame #20: + 0x47def4 (0x7fd11f07eef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: frame #21: + 0x1445a6 (0x555fb852d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #22: _PyObject_MakeTpCall + 0x26b (0x555fb8526a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #23: + 0x150866 (0x555fb8539866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x555fb8522142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #25: _PyFunction_Vectorcall + 0x6c (0x555fb852da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #26: PyObject_Call + 0xbc (0x555fb8539f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x555fb85202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #28: _PyFunction_Vectorcall + 0x6c (0x555fb852da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x555fb851e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #30: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x555fb851e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #32: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x555fb851e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #34: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x555fb851e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x555fb8525f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #37: _PyObject_Call_Prepend 
+ 0x69 (0x555fb8537c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #38: + 0x211239 (0x555fb85fa239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #39: _PyObject_MakeTpCall + 0x26b (0x555fb8526a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x555fb85223e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #41: _PyFunction_Vectorcall + 0x6c (0x555fb852da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x555fb851dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #43: _PyFunction_Vectorcall + 0x6c (0x555fb852da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x555fb851e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #45: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #46: PyObject_Call + 0xbc (0x555fb8539f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x555fb85202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #48: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #49: PyObject_Call + 0xbc (0x555fb8539f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x555fb85202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #51: _PyFunction_Vectorcall + 0x6c (0x555fb852da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x555fb8526007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #53: _PyObject_Call_Prepend + 0x69 (0x555fb8537c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #54: + 0x211239 (0x555fb85fa239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #55: PyObject_Call + 0x207 (0x555fb853a067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x555fb85202b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #57: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x555fb851e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #59: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #60: PyObject_Call + 0xbc (0x555fb8539f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x555fb85202b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #62: + 0x150582 (0x555fb8539582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #63: PyObject_Call + 0xbc (0x555fb8539f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank41]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank41]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc270c66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank41]: frame #1: + 0x5b3a23e (0x7fc2aa78323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fc2aa77dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fc2aa77df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fc2aa77efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: 
frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc2aa733371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc2aa733371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc2aa733371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fc2aa733371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fc271f40189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fc271f47610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fc271f66978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank41]: frame #12: + 0x5adc309 (0x7fc2aa725309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #13: + 0x5ae6f10 (0x7fc2aa72ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #14: + 0x5ae6fa5 (0x7fc2aa72ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #15: + 0x5124446 (0x7fc2a9d6d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #16: + 0x1acf4b8 (0x7fc2a67184b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #17: + 0x5aee004 (0x7fc2aa737004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #18: + 0x5af36b5 (0x7fc2aa73c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #19: + 0xd2631e (0x7fc2bd32631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank41]: frame #20: + 0x47def4 (0x7fc2bca7def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank41]: frame #21: + 0x1445a6 (0x563e61b0a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563e61b03a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #23: + 0x150866 (0x563e61b16866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame 
#24: _PyEval_EvalFrameDefault + 0x4c12 (0x563e61aff142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563e61b0aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #26: PyObject_Call + 0xbc (0x563e61b16f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563e61afd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563e61b0aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563e61afb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #30: + 0x150582 (0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563e61afb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #32: + 0x150582 (0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563e61afb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #34: + 0x150582 (0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563e61afb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563e61b02f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563e61b14c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #38: + 0x211239 (0x563e61bd7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563e61b03a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563e61aff3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563e61b0aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563e61afac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563e61b0aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563e61afb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #45: + 0x150582 (0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #46: PyObject_Call + 0xbc (0x563e61b16f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563e61afd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #48: + 0x150582 
(0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #49: PyObject_Call + 0xbc (0x563e61b16f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563e61afd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563e61b0aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563e61b03007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563e61b14c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #54: + 0x211239 (0x563e61bd7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #55: PyObject_Call + 0x207 (0x563e61b17067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563e61afd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #57: + 0x150582 (0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563e61afb8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #59: + 0x150582 (0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #60: PyObject_Call + 0xbc (0x563e61b16f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563e61afd2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #62: + 0x150582 (0x563e61b16582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #63: PyObject_Call + 0xbc (0x563e61b16f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
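When only these secondary errors are visible, one quick check is whether the rendezvous TCPStore on MASTER_ADDR is still reachable from the failing node. Below is a diagnostic sketch, assuming the job's MASTER_ADDR and MASTER_PORT are still exported in the environment; the probe key name is illustrative.

# store_probe.py (diagnostic sketch, run inside the job's environment)
import os
from datetime import timedelta
import torch.distributed as dist

store = dist.TCPStore(
    os.environ["MASTER_ADDR"],
    int(os.environ["MASTER_PORT"]),
    is_master=False,                      # connect as a client only
    timeout=timedelta(seconds=10),
)
store.set("probe_key", "alive")
print(store.get("probe_key"))             # raises (e.g. connection reset) if the store host is down

If this probe also fails with a connection reset, the lines worth reading are whatever rank 0 printed before these traces, not the traces themselves.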
-[default5]:[rank61]: Traceback (most recent call last): -[default2]:[rank58]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default5]:[rank61]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( 
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in 
run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank61]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank61]: 
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4200a76897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:[rank61]: frame #1 .. frame #63: c10d::TCPStore::doWait/doGet/get, c10d::PrefixStore::get, c10d::ProcessGroupNCCL::broadcastUniqueNCCLID/getNCCLComm/recv, followed by libtorch_cpu/libtorch_python and CPython interpreter frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call)
-[default5]:[rank61]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default2]:[rank58]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank58]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default2]:[rank42]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank42]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default7]:[rank63]: Traceback (most recent call last):
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank63]: trainer.train(dataloader)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank63]: output = model(**micro_batch)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank63]: sharded_logits = self.model(
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank63]: return self._call_impl(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank63]: return forward_call(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank63]: pipeline_state.run_communication()
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank63]: recv_activation_tensor = recv_activation()
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default7]:[rank63]: dist.recv(
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank63]: return func(*args, **kwargs)
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank63]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default7]:[rank63]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default7]:[rank63]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6d1cf51897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default7]:[rank63]: frame #1 .. frame #63: c10d::TCPStore::doWait/doGet/get, c10d::PrefixStore::get, c10d::ProcessGroupNCCL::broadcastUniqueNCCLID/getNCCLComm/recv, followed by libtorch_cpu/libtorch_python and CPython interpreter frames
-[default7]:[rank63]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default3]:[rank59]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank59]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default4]:[rank60]: Traceback (most recent call last): -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: output = model(**micro_batch) -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: output = model(**micro_batch) -[default6]:[rank62]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: Traceback (most recent call last): -[default7]:[rank47]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: trainer.train(dataloader) -[default6]:[rank62]: dist.recv( -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: pg.recv([tensor], 
group_src_rank, tag).wait() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: output = model(**micro_batch) -[default6]:[rank62]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank62]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa24ff8d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: frame #1: + 0x5b3a23e (0x7fa289aaa23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fa289aa4c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fa289aa4f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: output = model(**micro_batch) -[default6]:[rank62]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fa289aa5fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa289a5a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa289a5a371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: pipeline_state.run_communication() -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa289a5a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fa289a5a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fa251267189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fa25126e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fa25128d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #12: + 0x5adc309 (0x7fa289a4c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #13: + 0x5ae6f10 (0x7fa289a56f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: frame #14: + 0x5ae6fa5 (0x7fa289a56fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: 
meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default6]:[rank62]: frame #15: + 0x5124446 (0x7fa289094446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #16: + 0x1acf4b8 (0x7fa285a3f4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: frame #17: + 0x5aee004 (0x7fa289a5e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: dist.recv( -[default6]:[rank62]: frame #18: + 0x5af36b5 (0x7fa289a636b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: frame #19: + 0xd2631e (0x7fa29c64d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: frame #20: + 0x47def4 (0x7fa29bda4ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: return func(*args, **kwargs) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: frame #21: + 0x1445a6 (0x5646d09715a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5646d096aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: frame #23: + 0x150866 (0x5646d097d866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5646d0966142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5646d0971a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #26: PyObject_Call + 0xbc (0x5646d097df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5646d09642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5646d0971a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5646d09628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #30: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5646d09628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #32: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: sharded_logits = self.model( -[default4]:[rank60]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank60]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17a72bf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: output = model(**micro_batch) -[default4]:[rank60]: frame #1: + 0x5b3a23e (0x7f17e0ddc23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f17e0dd6c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5646d09628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f17e0dd6f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f17e0dd7fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: output = model(**micro_batch) -[default6]:[rank62]: frame #34: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5646d09628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e0d8c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default6]:[rank62]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5646d0969f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5646d097bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e0d8c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: frame #38: + 0x211239 (0x5646d0a3e239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e0d8c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5646d096aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5646d09663e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5646d0971a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5646d0961c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f17e0d8c371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f17a8599189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank62]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5646d0971a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5646d09628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #45: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #46: PyObject_Call + 0xbc (0x5646d097df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f17a85a0610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f17a85bf978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: frame #12: + 0x5adc309 (0x7f17e0d7e309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5646d09642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #13: + 0x5ae6f10 (0x7f17e0d88f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: frame #48: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default6]:[rank62]: frame #49: PyObject_Call + 0xbc (0x5646d097df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: frame #14: + 0x5ae6fa5 (0x7f17e0d88fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5646d09642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5646d0971a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #15: + 0x5124446 (0x7f17e03c6446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5646d096a007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: frame #16: + 0x1acf4b8 (0x7f17dcd714b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5646d097bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #54: + 0x211239 (0x5646d0a3e239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: frame #55: PyObject_Call + 0x207 (0x5646d097e067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5646d09642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: sharded_logits = self.model( -[default6]:[rank62]: frame #57: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5646d09628fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: frame #59: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #60: PyObject_Call + 0xbc (0x5646d097df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5646d09642b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #17: + 0x5aee004 (0x7f17e0d90004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: frame #62: + 0x150582 (0x5646d097d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: frame #63: PyObject_Call + 0xbc (0x5646d097df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: return 
forward_call(*args, **kwargs) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: frame #18: + 0x5af36b5 (0x7f17e0d956b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #19: + 0xd2631e (0x7f17f397f31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: frame #20: + 0x47def4 (0x7f17f30d6ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: frame #21: + 0x1445a6 (0x55b5a95d75a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b5a95d0a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #23: + 0x150866 (0x55b5a95e3866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b5a95cc142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b5a95d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #26: PyObject_Call + 0xbc (0x55b5a95e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5a95ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55b5a95d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b5a95c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #30: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b5a95c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #32: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b5a95c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #34: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b5a95c88fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b5a95cff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b5a95e1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #38: + 0x211239 (0x55b5a96a4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b5a95d0a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b5a95cc3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b5a95d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b5a95c7c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b5a95d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b5a95c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #45: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #46: PyObject_Call + 0xbc (0x55b5a95e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default4]:[rank60]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5a95ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #48: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #49: PyObject_Call + 0xbc (0x55b5a95e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5a95ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b5a95d7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b5a95d0007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b5a95e1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 
117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: frame #54: + 0x211239 (0x55b5a96a4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #55: PyObject_Call + 0x207 (0x55b5a95e4067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5a95ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: pipeline_state.run_communication() -[default4]:[rank60]: frame #57: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b5a95c88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #59: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #60: PyObject_Call + 0xbc (0x55b5a95e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b5a95ca2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #62: + 0x150582 (0x55b5a95e3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x55b5a95e3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/pyt[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -hon3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank47]: return forward_call(*args, **kwargs) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] 
-[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default3]:[rank43]: dist.recv(
-[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank43]: return func(*args, **kwargs)
-[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank43]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank43]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default3]:[rank43]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f618ddaa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:[rank43]: frame #1: + 0x5b3a23e (0x7f61c78c723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default3]:[rank43]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f61c78c1c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default3]:[rank43]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f61c78c1f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default3]:[rank43]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f61c78c2fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default3]:[rank43]: frames #5-#8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f61c7877371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default3]:[rank43]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f618f084189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank43]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f618f08b610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank43]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f618f0aa978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank43]: frames #12-#63: anonymous offsets in libtorch_cpu.so and libtorch_python.so followed by CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend, PyObject_Call)
-[default3]:[rank43]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[ranks 40, 44 and 47 fail in the same dist.recv call (nanotron pipeline_parallel block.py:126 forward -> functional.py:117 recv_from_pipeline_state_buffer -> state.py:150 run_communication -> state.py:31 __call__ -> p2p.py:353 recv_tensors -> p2p.py:326 irecv_tensors -> p2p.py:246 _recv_meta -> distributed_c10d.py:1932 recv); their "Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672" stacks match rank 43's apart from library load addresses]
-[default0]:[rank40]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default0]:[rank40]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default4]:[rank44]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default7]:[rank47]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default7]:[rank47]: . This may indicate a possible application crash on rank 0 or a network set up issue.
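What ranks 40, 43, 44 and 47 were doing when the store connection dropped is the receive half of nanotron's pipeline parallelism: a non-first pipeline stage blocks in dist.recv on the activation metadata coming from the previous stage, and the first point-to-point call between a pair of ranks is also the moment their NCCL communicator gets created, which is when the ncclUniqueId published in the c10d key-value store is fetched (the store->get('0:1') in the error). The snippet below is a minimal sketch of that recv-metadata-then-recv-tensor pattern using only the public torch.distributed API; it is not nanotron's P2P implementation, and the 8-slot metadata layout, the bfloat16 dtype and the helper name recv_activation are invented for the illustration.

import os

import torch
import torch.distributed as dist


def recv_activation(from_rank: int, device: torch.device) -> torch.Tensor:
    # 1) Receive a small metadata tensor first: slot 0 holds ndim, the
    #    following slots hold the shape. This mirrors the _recv_meta step
    #    in the tracebacks above, where the failing dist.recv sits.
    meta = torch.empty(8, dtype=torch.long, device=device)
    dist.recv(meta, src=from_rank)  # first recv between a pair lazily builds their NCCL communicator
    ndim = int(meta[0].item())
    shape = [int(d) for d in meta[1 : 1 + ndim]]
    # 2) Receive the activation tensor described by that metadata.
    buf = torch.empty(shape, dtype=torch.bfloat16, device=device)
    dist.recv(buf, src=from_rank)
    return buf


if __name__ == "__main__":
    # torchrun sets RANK, WORLD_SIZE, LOCAL_RANK and the rendezvous variables.
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)

    if dist.get_rank() == 0:
        x = torch.randn(2, 4096, dtype=torch.bfloat16, device=device)
        meta = torch.zeros(8, dtype=torch.long, device=device)
        meta[0] = x.dim()
        meta[1 : 1 + x.dim()] = torch.tensor(x.shape, device=device)
        dist.send(meta, dst=1)
        dist.send(x, dst=1)
    elif dist.get_rank() == 1:
        y = recv_activation(from_rank=0, device=device)
        print(f"rank 1 received activation of shape {tuple(y.shape)}")

    dist.destroy_process_group()

Run with at least two processes, e.g. torchrun --nproc_per_node 2 p2p_sketch.py; the first dist.recv on rank 1 is where the communicator for the 0/1 pair is set up, which is the step every failing rank above is stuck in when the store connection is reset.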
-[default5]:[rank45]: Traceback (most recent call last):
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default5]:[rank45]: trainer.train(dataloader)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank45]: output = model(**micro_batch)
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank45]: return self._call_impl(*args, **kwargs)
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank45]: return forward_call(*args, **kwargs)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank45]: sharded_logits = self.model(
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank45]: return self._call_impl(*args, **kwargs)
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank45]: return forward_call(*args, **kwargs)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank45]: return self._call_impl(*args, **kwargs)
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank45]: return forward_call(*args, **kwargs)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank45]: pipeline_state.run_communication()
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank45]: recv_activation_tensor = recv_activation()
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default5]:[rank45]: dist.recv(
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank45]: return func(*args, **kwargs)
-[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank45]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default5]:[rank45]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default5]:[rank45]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f371456d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:[rank45]: frame #1: + 0x5b3a23e (0x7f374e08a23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank45]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f374e084c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank45]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f374e084f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank45]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f374e085fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank45]: frames #5-#8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f374e03a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank45]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f3715847189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank45]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f371584e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank45]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f371586d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank45]: frames #12-#63: anonymous offsets in libtorch_cpu.so and libtorch_python.so followed by CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend, PyObject_Call), identical in shape to the other failing ranks
-[default5]:[rank45]: . This may indicate a possible application crash on rank 0 or a network set up issue.
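All of the receiving ranks die inside recvBytes while talking to the c10d TCPStore, i.e. the store hosted on the rendezvous master went away while they were still fetching the ncclUniqueId; as the trailing log line says, that usually means another rank (typically the first pipeline stage) crashed first, so the real root cause sits earlier in this log. The snippet below is a hypothetical pre-flight check, not part of bench_cluster: the environment variables are standard NCCL/c10d debug knobs, and the gloo side group exists only because dist.monitored_barrier is implemented for gloo and names exactly which ranks failed to arrive instead of every survivor dying on a dead store.

import datetime
import os

import torch
import torch.distributed as dist

# Verbose communicator-setup logging; these are read when the process group
# and NCCL communicators are created, so set them before init.
os.environ.setdefault("NCCL_DEBUG", "INFO")
os.environ.setdefault("NCCL_DEBUG_SUBSYS", "INIT")
os.environ.setdefault("TORCH_DISTRIBUTED_DEBUG", "DETAIL")

dist.init_process_group(
    backend="nccl",
    timeout=datetime.timedelta(minutes=10),  # fail loudly instead of hanging forever
)
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

# monitored_barrier needs a gloo group; it reports the ranks that never
# arrive, localizing the first crash before any NCCL p2p communicator is built.
gloo_group = dist.new_group(backend="gloo")
dist.monitored_barrier(group=gloo_group, timeout=datetime.timedelta(minutes=2))
print(f"rank {dist.get_rank()}: pre-flight barrier passed")

Dropping something like this at the top of the training entry point, or simply scanning the earlier part of log.out for the first rank that actually raised, usually turns this wall of secondary "Connection reset by peer" tracebacks into a single actionable failure.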
-[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank37]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank37]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6833cea897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank37]: frame #1: + 0x5b3a23e (0x7f686d80723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f686d801c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f686d801f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f686d802fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f686d7b7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f686d7b7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f686d7b7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f686d7b7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f6834fc4189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f6834fcb610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f6834fea978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank37]: frame #12: + 0x5adc309 (0x7f686d7a9309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #13: + 0x5ae6f10 (0x7f686d7b3f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #14: + 0x5ae6fa5 (0x7f686d7b3fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #15: + 0x5124446 (0x7f686cdf1446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #16: + 0x1acf4b8 (0x7f686979c4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #17: + 0x5aee004 (0x7f686d7bb004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #18: + 0x5af36b5 (0x7f686d7c06b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #19: + 0xd2631e (0x7f68803aa31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: frame #20: + 0x47def4 (0x7f687fb01ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank37]: frame #21: + 0x1445a6 (0x55a84168f5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55a841688a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #23: + 0x150866 (0x55a84169b866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55a841684142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55a84168fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #26: PyObject_Call + 0xbc (0x55a84169bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default5]:[rank37]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55a8416822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55a84168fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55a8416808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #30: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55a8416808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #32: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55a8416808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #34: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55a8416808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55a841687f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55a841699c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #38: + 0x211239 (0x55a84175c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55a841688a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55a8416843e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55a84168fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55a84167fc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55a84168fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55a8416808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #45: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #46: PyObject_Call + 0xbc (0x55a84169bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55a8416822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #48: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #49: PyObject_Call + 0xbc (0x55a84169bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55a8416822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #51: 
_PyFunction_Vectorcall + 0x6c (0x55a84168fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55a841688007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55a841699c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #54: + 0x211239 (0x55a84175c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #55: PyObject_Call + 0x207 (0x55a84169c067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55a8416822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #57: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55a8416808fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #59: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #60: PyObject_Call + 0xbc (0x55a84169bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55a8416822b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #62: + 0x150582 (0x55a84169b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #63: PyObject_Call + 0xbc (0x55a84169bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
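Note on the two tracebacks above (ranks 45 and 37): both fail at the same point. The pipeline stage blocks in `_recv_meta` on `dist.recv`, NCCL then tries to fetch the `ncclUniqueId` for the new communicator from the c10d TCPStore, and the store read fails with "Connection reset by peer" because the store host has already gone away. In other words, this is a secondary symptom of another rank aborting first, which matches the SIGABRT (exit code -6) exits that the elastic agent reports just below. The deleted `bench.slurm` classifies each run by grepping `log.out`; the following is a minimal sketch of that idea under stated assumptions: `$LOG` and `$STATUS` are placeholder paths, and the store-connection branch is an illustrative extension, not part of the original script.

#!/bin/bash
# Sketch only: $LOG / $STATUS are placeholders; the "Connection reset by peer"
# branch is an assumed extension of the grep-based classifier in bench.slurm.
LOG=/path/to/log.out
STATUS=/path/to/status.txt

if grep -q "OutOfMemoryError" "$LOG"; then
    printf "oom" > "$STATUS"          # a rank actually ran out of GPU memory
elif grep -q "Timeout at NCCL" "$LOG"; then
    printf "timeout" > "$STATUS"      # a collective timed out
elif grep -qE "Connection reset by peer|RendezvousConnectionError" "$LOG"; then
    printf "fail" > "$STATUS"         # secondary symptom: the c10d store / another rank died first
else
    printf "fail" > "$STATUS"
fi

Branch order matters here: a single log can contain several of these messages (the store-reset lines almost always accompany some earlier failure), so the primary causes are checked before the secondary symptom.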
-W0703 00:33:30.405000 140629515450176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1108347 closing signal SIGTERM -W0703 00:33:30.405000 140629515450176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1108348 closing signal SIGTERM -W0703 00:33:30.405000 140629515450176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1108350 closing signal SIGTERM -E0703 00:33:31.466000 140629515450176 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1108345) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:33:30 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1108346) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1108346 -[2]: - time : 2024-07-03_00:33:30 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1108349) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1108349 -[3]: - time : 2024-07-03_00:33:30 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1108351) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1108351 -[4]: - time : 2024-07-03_00:33:30 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1108352) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1108352 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:33:30 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1108345) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1108345 -============================================================ -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -W0703 00:33:34.132000 139844076955392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_873734_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 00:33:34.304000 139711063140096 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1832456_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:34.837000 140439604508416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1804798_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:34.877000 139980002621184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1032079_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:35.429000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832530 closing signal SIGTERM -W0703 00:33:35.429000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832531 closing signal SIGTERM -W0703 00:33:35.429000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832532 closing signal SIGTERM -W0703 00:33:35.429000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832533 closing signal SIGTERM -W0703 00:33:35.430000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832534 closing signal SIGTERM -W0703 00:33:35.430000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832535 closing signal SIGTERM -W0703 00:33:35.433000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832536 closing signal SIGTERM -W0703 00:33:35.434000 139716723873600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1832537 closing signal SIGTERM -W0703 00:33:35.437000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032152 closing signal SIGTERM -W0703 00:33:35.437000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032153 closing signal SIGTERM -W0703 00:33:35.437000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032154 closing signal SIGTERM -W0703 00:33:35.438000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032155 closing signal SIGTERM -W0703 00:33:35.440000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032156 closing signal SIGTERM -W0703 00:33:35.440000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032157 closing signal SIGTERM -W0703 00:33:35.440000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032158 closing signal SIGTERM -W0703 00:33:35.441000 139985663354688 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1032159 closing signal SIGTERM -W0703 00:33:35.491000 140445265241920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804872 closing signal SIGTERM -W0703 00:33:35.491000 140445265241920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804873 closing signal SIGTERM -W0703 00:33:35.491000 140445265241920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804874 closing signal SIGTERM -W0703 00:33:35.493000 140445265241920 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804875 closing signal SIGTERM -W0703 00:33:35.493000 140445265241920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804876 closing signal SIGTERM -W0703 00:33:35.493000 140445265241920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804877 closing signal SIGTERM -W0703 00:33:35.493000 140445265241920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804878 closing signal SIGTERM -W0703 00:33:35.496000 140445265241920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1804879 closing signal SIGTERM -W0703 00:33:35.503000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873807 closing signal SIGTERM -W0703 00:33:35.503000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873808 closing signal SIGTERM -W0703 00:33:35.504000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873809 closing signal SIGTERM -W0703 00:33:35.505000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873810 closing signal SIGTERM -W0703 00:33:35.505000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873811 closing signal SIGTERM -W0703 00:33:35.506000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873812 closing signal SIGTERM -W0703 00:33:35.506000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873813 closing signal SIGTERM -W0703 00:33:35.506000 139849737688896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873814 closing signal SIGTERM -W0703 00:33:39.137000 139844076955392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_873734_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:39.309000 139711063140096 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1832456_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:39.645000 139849737688896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_873734_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:39.659000 139849737688896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_873734_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 00:33:39.841000 140439604508416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1804798_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 00:33:39.882000 139980002621184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1032079_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 00:33:42.960000 139985663354688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1032079_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:42.976000 139985663354688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1032079_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 00:33:43.535000 140445265241920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1804798_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:43.548000 140445265241920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1804798_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 00:33:43.564000 139716723873600 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1832456_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:33:43.582000 139716723873600 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1832456_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-128/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/bench.slurm deleted file mode 100644 index de5e011cd073a71f8dd9a9d47cdf5e914040cdf3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/config.yaml deleted file mode 100644 index b0d7f5b6ae3b9c8d23eefe3027908deddd021152..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 32 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 16 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out deleted file mode 100644 index 85fa1c1ca443be23b46f435d002795be4299a7a3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/log.out +++ /dev/null @@ -1,5803 +0,0 @@ -======================== -START TIME: Wed Jul 3 01:22:40 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 01:22:46.347000 140303926261568 torch/distributed/run.py:757] -W0703 01:22:46.347000 140303926261568 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.347000 140303926261568 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:22:46.347000 140303926261568 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.481000 139920171444032 torch/distributed/run.py:757] -W0703 01:22:46.481000 139920171444032 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.481000 139920171444032 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:22:46.481000 139920171444032 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.533000 140692406683456 torch/distributed/run.py:757] -W0703 01:22:46.533000 140692406683456 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.533000 140692406683456 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:22:46.533000 140692406683456 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.693000 139929884014400 torch/distributed/run.py:757] -W0703 01:22:46.693000 139929884014400 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.693000 139929884014400 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:22:46.693000 139929884014400 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.870000 140222101264192 torch/distributed/run.py:757] -W0703 01:22:46.870000 140222101264192 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.870000 140222101264192 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 01:22:46.870000 140222101264192 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.910000 140284826212160 torch/distributed/run.py:757] -W0703 01:22:46.910000 140284826212160 torch/distributed/run.py:757] ***************************************** -W0703 01:22:46.910000 140284826212160 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:22:46.910000 140284826212160 torch/distributed/run.py:757] ***************************************** -W0703 01:22:47.062000 140380777928512 torch/distributed/run.py:757] -W0703 01:22:47.062000 140380777928512 torch/distributed/run.py:757] ***************************************** -W0703 01:22:47.062000 140380777928512 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:22:47.062000 140380777928512 torch/distributed/run.py:757] ***************************************** -W0703 01:22:47.101000 140016183863104 torch/distributed/run.py:757] -W0703 01:22:47.101000 140016183863104 torch/distributed/run.py:757] ***************************************** -W0703 01:22:47.101000 140016183863104 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:22:47.101000 140016183863104 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 01:23:12 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=2, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:23:12 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=16, -[default0]:07/03/2024 01:23:12 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=32, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16')), -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 01:23:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default1]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-153]: No checkpoint path provided. 
-[default1]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=1|PP=1|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 01:23:30 [INFO|DP=1|PP=0|TP=15|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-78]: No checkpoint path provided. 
-[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. 
Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 01:23:30 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 01:23:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 01:23:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 01:23:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 01:23:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 01:23:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 01:23:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 01:23:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:23:34 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:23:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 01:23:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 01:23:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 01:23:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 01:23:35.981752 | mbs: 16 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 01:23:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 01:23:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default5]:07/03/2024 01:23:36 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:23:36 [WARNING|DP=0|PP=1|TP=2|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:23:36 [WARNING|DP=0|PP=1|TP=6|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:23:36 [WARNING|DP=0|PP=1|TP=1|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:23:36 [WARNING|DP=1|PP=0|TP=9|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:23:36 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:23:36 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. 
Setting CardData to empty.
-(The "Repo card metadata block was not found. Setting CardData to empty." warning above is repeated by every remaining rank.)
-(Each rank then emits the following PyTorch autograd UserWarning once during the backward pass; a single instance is kept below.)
-[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
[... remaining duplicated UserWarning/run_backward pairs omitted ...]
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600047 milliseconds before timing out.
[... every one of the 64 global ranks reports the same watchdog timeout (SeqNum=15, OpType=SEND, Timeout(ms)=600000, ~600 s elapsed): global ranks 0-31 report it as [Rank 0] with NumelIn/NumelOut=65536, and global ranks 32-63 as [Rank 1] with NumelIn/NumelOut=8388608; the remaining per-rank timeout lines, interleaved with the tracebacks below, are omitted ...]
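The Timeout(ms)=600000 in these watchdog entries is the 10-minute NCCL process-group timeout after which the watchdog aborts the communicator. As an illustrative aside (not part of the original log or launch script), here is a minimal sketch of how that timeout is usually configured when initializing torch.distributed; the function name and the 30-minute value are arbitrary choices for the example:

    # Sketch only: assumes MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE are already set
    # by the launcher (e.g. torchrun), as in this benchmark's launch command.
    from datetime import timedelta
    import torch.distributed as dist

    def init_process_group_with_longer_timeout():
        # A longer timeout only delays the watchdog abort seen above; a SEND whose
        # peer never posts the matching RECV still has to be fixed at the source.
        dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))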
-[default7]:[rank31]: Traceback (most recent call last):
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank31]: trainer.train(dataloader)
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default7]:[rank31]: grad_accumulator.backward(sum(activations))
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default7]:[rank31]: result = loss.backward()
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default7]:[rank31]: torch.autograd.backward(
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default7]:[rank31]: _engine_run_backward(
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default7]:[rank31]: return user_fn(self, *args)
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default7]:[rank31]: pipeline_state.run_communication()
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default7]:[rank31]: self.grads_buffer.append(recv_grad())
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank31]: dist.recv(
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank31]: return func(*args, **kwargs)
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait()
-[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[... further per-rank watchdog timeout lines omitted (see summary above) ...]
[... additional watchdog timeout lines omitted; ranks 27 and 30 then fail with the same backward-path traceback as rank 31 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0." ...]
-[default4]:[rank52]: Traceback (most recent call last):
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank52]: trainer.train(dataloader)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default4]:[rank52]: output = model(**micro_batch)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default4]:[rank52]: sharded_logits = self.model(
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]: return self._call_impl(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]: return forward_call(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default4]:[rank52]: pipeline_state.run_communication()
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank52]: recv_activation_tensor = recv_activation()
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank52]: dist.recv(
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank52]: return func(*args, **kwargs)
-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... rank 62 fails with the same forward-path traceback as rank 52 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." ...]
[... ranks 17, 20, and 24 fail with the same backward-path traceback as rank 31 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."; rank 23 then begins the same traceback, which continues below ...]
-[default7]:[rank23]: Traceback (most recent call last):
[... identical frames to rank 31, down to ...]
-[default7]:[rank23]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py",
line 269, in _recv_meta -[default7]:[rank23]: dist.recv( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank23]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return user_fn(self, *args) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank21]: self.grads_buffer.append(recv_grad()) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank21]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank21]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank21]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank21]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank21]: dist.recv( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank21]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank63]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default0]:[rank56]: return 
self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: sharded_logits = self.model( -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: dist.recv( -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
[Ranks 33 and 34 emitted the same forward-pass traceback as rank 37 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
-[default3]:[rank3]: Traceback (most recent call last):
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank3]: trainer.train(dataloader)
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank3]: grad_accumulator.backward(sum(activations))
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank3]: result = loss.backward()
-[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank3]: torch.autograd.backward(
-[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank3]: _engine_run_backward(
-[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank3]: return user_fn(self, *args)
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank3]: pipeline_state.run_communication()
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default3]:[rank3]: self.grads_buffer.append(recv_grad())
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default3]:[rank3]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank3]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank3]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank3]: dist.recv(
-[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank3]: return func(*args, **kwargs)
-[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank3]: pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[Ranks 0 and 10 emitted the same backward-pass traceback, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."]
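The two traceback shapes above differ only in where the aborted receive happens: the forward path (rank 37 and its peers) dies while receiving activations via recv_from_pipeline_state_buffer, while the backward path (rank 3 and its peers) dies while receiving gradients via recv_grad. Both bottom out in _recv_meta -> dist.recv and raise torch.distributed.DistBackendError once the NCCL communicator has been aborted, which usually suggests the root cause lies with another rank or an earlier failure in the job rather than with the rank reporting the error. When triaging many logs of this kind it can help to tally ranks per phase automatically. The snippet below is a minimal, hypothetical triage helper, not part of bench_cluster or nanotron; the marker strings and the "[defaultN]:[rankM]:" prefix are taken from the log above, and the log path is an assumption.

import re
from collections import defaultdict

# Markers copied from the tracebacks above: the forward path fails while receiving
# activations, the backward path while receiving gradients.
FORWARD_MARKER = "recv_activation_tensor = recv_activation()"
BACKWARD_MARKER = "self.grads_buffer.append(recv_grad())"
ABORT_MARKER = "NCCL communicator was aborted"

def classify_nccl_aborts(log_text: str) -> dict:
    """Group aborted pipeline receives by rank and by phase (forward/backward)."""
    # torchrun --tee prefixes every line with "[defaultN]:[rankM]:"; re-split on that
    # prefix so interleaved tracebacks are reassembled per rank.
    per_rank = defaultdict(str)
    for rank, chunk in re.findall(
        r"\[default\d+\]:\[rank(\d+)\]:(.*?)(?=\[default\d+\]:\[rank\d+\]:|\Z)",
        log_text,
        flags=re.S,
    ):
        per_rank[int(rank)] += chunk
    phases = {"forward": set(), "backward": set()}
    for rank, text in per_rank.items():
        if ABORT_MARKER not in text:
            continue
        if FORWARD_MARKER in text:
            phases["forward"].add(rank)
        if BACKWARD_MARKER in text:
            phases["backward"].add(rank)
    return phases

if __name__ == "__main__":
    # Hypothetical usage: point this at the job's log.out written by the slurm script.
    with open("log.out") as f:
        print(classify_nccl_aborts(f.read()))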
[Ranks 40 and 60 emitted the same forward-pass traceback as rank 37 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
[The remaining frames of rank 0's interleaved backward-pass traceback appeared here. Ranks 7, 1, 6, and 2 emitted the same backward-pass traceback as rank 3 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."; ranks 38 and 51 emitted the same forward-pass traceback as rank 37, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
-[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank4]: self.grads_buffer.append(recv_grad()) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank4]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank4]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank4]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default4]:[rank4]: dist.recv( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank4]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: self.grads_buffer.append(recv_grad()) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank8]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank8]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank8]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default0]:[rank8]: dist.recv( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank8]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank9]: self.grads_buffer.append(recv_grad()) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank9]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank9]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank9]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank9]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank9]: dist.recv( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank9]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f71a211a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f71a33f3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f71a33f8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f71a33f9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f71eee92e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f71f3ed9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f71f3ca4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f71a211a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f71a33f3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f71a33f8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f71a33f9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f71eee92e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f71f3ed9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f71f3ca4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f71a211a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f71a307d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f71eee92e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f71f3ed9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f71f3ca4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:
-[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank29]: self.grads_buffer.append(recv_grad()) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank29]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank29]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank29]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank29]: dist.recv( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank29]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank28]: self.grads_buffer.append(recv_grad()) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank28]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank28]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank28]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default4]:[rank28]: dist.recv( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank28]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff19a547897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff19b820c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff19b825a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff19b826dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff1e72bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff1ec306609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff1ec0d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff19a547897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff19b820c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff19b825a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff19b826dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff1e72bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff1ec306609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff1ec0d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff19a547897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7ff19b4aa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7ff1e72bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7ff1ec306609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7ff1ec0d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank25]: self.grads_buffer.append(recv_grad()) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank25]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank25]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank25]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank25]: dist.recv( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank25]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8ef334897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8f060dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8f0612a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8f0613dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa93c0ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa9410f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa940ebe353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8ef334897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8f060dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8f0612a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8f0613dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa93c0ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa9410f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa940ebe353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8ef334897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fa8f0297119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fa93c0ace95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fa9410f3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fa940ebe353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank15]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank15]: grad_accumulator.backward(sum(activations)) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank15]: result = loss.backward() -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank15]: torch.autograd.backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank15]: _engine_run_backward( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank15]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank15]: return user_fn(self, *args) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank15]: pipeline_state.run_communication() -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank15]: self.grads_buffer.append(recv_grad()) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank15]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank15]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default7]:[rank15]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank15]: dist.recv( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank15]: return func(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank15]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcfce7e2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcfcfabbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcfcfac0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcfcfac1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd01b55ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd0205a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd02036c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcfce7e2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcfcfabbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcfcfac0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcfcfac1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd01b55ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd0205a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd02036c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcfce7e2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fcfcf745119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd01b55ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fd0205a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd02036c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8b942ca897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8b955a3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8b955a8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8b955a9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8be1042e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8be6089609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8be5e54353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8b942ca897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8b955a3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8b955a8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8b955a9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8be1042e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8be6089609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8be5e54353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8b942ca897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f8b9522d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f8be1042e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f8be6089609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f8be5e54353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8635704897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f86369ddc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f86369e2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f86369e3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f868247ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f86874c3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f868728e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8635704897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f86369ddc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f86369e2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f86369e3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f868247ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f86874c3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f868728e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8635704897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f8636667119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f868247ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f86874c3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f868728e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank14]: self.grads_buffer.append(recv_grad()) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank14]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank14]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank14]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank14]: dist.recv( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank14]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7eff677ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7eff68ac5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7eff68acaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7eff68acbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7effb4564e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7effb95ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7effb9376353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7eff677ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7eff68ac5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7eff68acaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7eff68acbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7effb4564e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7effb95ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7effb9376353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7eff677ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7eff6874f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7effb4564e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7effb95ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7effb9376353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
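[Editor's note] The failures above all follow one pattern: a pipeline-parallel rank blocks in dist.recv inside P2P._recv_meta while a peer's NCCL watchdog aborts the process group once a SEND/RECV exceeds the default 10-minute window (Timeout(ms)=600000 in the log). As a hedged illustration only, and not part of the original benchmark scripts, one way to rule out a too-short watchdog window while debugging such hangs is to initialize the process group with a larger timeout:

    # Editorial sketch, assuming torch.distributed with the NCCL backend.
    # The 30-minute value is an arbitrary example, not taken from the benchmark.
    from datetime import timedelta
    import torch.distributed as dist

    dist.init_process_group(
        backend="nccl",
        timeout=timedelta(minutes=30),  # default watchdog window is 10 minutes (600000 ms)
    )

If the abort merely happens later with a longer timeout, that would point to a hung or mismatched pipeline peer (one stage never posting the matching send/recv) rather than a genuinely slow collective, which is consistent with some ranks timing out on OpType=SEND while others sit in _recv_meta.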
-[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6cdd037897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6cde310c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6cde315a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6cde316dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f6d29dafe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f6d2edf6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6d2ebc1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6cdd037897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6cde310c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6cde315a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6cde316dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f6d29dafe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f6d2edf6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6d2ebc1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6cdd037897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f6cddf9a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f6d29dafe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f6d2edf6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f6d2ebc1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f083c9de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f083dcb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f083dcbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f083dcbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0889756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f088e79d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f088e568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f083c9de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f083dcb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f083dcbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f083dcbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0889756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f088e79d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f088e568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f083c9de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f083d941119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f0889756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f088e79d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f088e568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1169c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb117c9bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb117ca0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb117ca1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb16373ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb168781609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb16854c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1169c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb117c9bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe134b16897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe135defc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe135df4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe135df5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe18188ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe1868d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe1866a0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe134b16897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe135defc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe135df4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe135df5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe18188ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe1868d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe1866a0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe134b16897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:frame #1: + 0xe32119 (0x7fe135a79119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:frame #2: + 0xd3e95 (0x7fe18188ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fe1868d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fe1866a0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: 
new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
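The traceback above shows the receiving pipeline stage blocked inside dist.recv (reached through nanotron's _recv_meta), while the watchdog entries report a SEND (SeqNum=15) that never completed within Timeout(ms)=600000, i.e. a 10-minute process-group deadline. As a hedged illustration only, not how this repository configures it, that deadline is the timeout argument of torch.distributed.init_process_group and could be raised like this:

from datetime import timedelta
import torch.distributed as dist

# Illustrative sketch: assumes the usual torchrun environment variables
# (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) are already set by the launcher.
def init_process_group_with_longer_timeout() -> None:
    dist.init_process_group(
        backend="nccl",
        timeout=timedelta(minutes=30),  # raise the 10-minute watchdog deadline
    )

Raising the timeout only hides a stalled peer for longer; the tracebacks in this log point at a pipeline peer that never issued the matching send/recv, which is why every rank in the group eventually hits the same deadline.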
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb117ca0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb117ca1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb16373ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb168781609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb16854c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1169c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fb117925119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb16373ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fb168781609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fb16854c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7d33a7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7d34d53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7d34d58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7d34d59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f7d807f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f7d85839609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f7d85604353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7d33a7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7d34d53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7d34d58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7d34d59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f7d807f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f7d85839609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f7d85604353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7d33a7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f7d349dd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f7d807f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f7d85839609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f7d85604353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
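Each of these rank tracebacks ends in the same place: p2p._recv_meta issues a blocking dist.recv for a small metadata message before the activation tensor itself is received. A minimal sketch of that two-step receive pattern, with illustrative names and shapes rather than nanotron's actual buffer layout, assuming an already-initialized NCCL process group:

import torch
import torch.distributed as dist

def recv_activation_sketch(from_rank: int, tag: int = 0) -> torch.Tensor:
    # Step 1: receive a fixed-size header describing the payload
    # (ndim followed by the shape, padded; purely illustrative).
    meta = torch.empty(8, dtype=torch.int64, device="cuda")
    dist.recv(meta, src=from_rank, tag=tag)  # blocking; the tracebacks above are stuck in this wait
    ndim = int(meta[0].item())
    shape = [int(x) for x in meta[1 : 1 + ndim].tolist()]
    # Step 2: receive the activation payload with the advertised shape.
    buf = torch.empty(shape, dtype=torch.bfloat16, device="cuda")
    dist.recv(buf, src=from_rank, tag=tag)
    return buf

If the sending stage never posts the corresponding send, the first dist.recv blocks until the process-group watchdog fires, which is exactly the "Watchdog caught collective operation timeout ... OpType=SEND" pattern repeated throughout this log.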
-[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default7]:[rank55]: return 
self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: pipeline_state.run_communication() -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: trainer.train(dataloader) -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 
31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return 
forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f91f3b4c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91f4e25c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91f4e2aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91f4e2bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f92408c4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f924590b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f92456d6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f91f3b4c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91f4e25c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91f4e2aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91f4e2bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f92408c4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f924590b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f92456d6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f91f3b4c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f91f4aaf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f92408c4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f924590b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f92456d6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efd0fd7e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efd11057c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efd1105ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efd1105ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7efd5caf6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7efd61b3d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7efd61908353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efd0fd7e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efd11057c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efd1105ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efd1105ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7efd5caf6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7efd61b3d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7efd61908353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efd0fd7e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7efd10ce1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7efd5caf6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7efd61b3d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7efd61908353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank59]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f439d4e0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f439e7b9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f439e7bea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f439e7bfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f43ea258e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f43ef29f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f43ef06a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f439d4e0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f439e7b9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f439e7bea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f439e7bfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f43ea258e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f43ef29f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f43ef06a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f439d4e0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f439e443119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f43ea258e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f43ef29f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f43ef06a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe7e13f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe7e26d1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe7e26d6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe7e26d7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fe82e170e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fe8331b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fe832f82353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe7e13f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe7e26d1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe7e26d6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe7e26d7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fe82e170e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fe8331b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fe832f82353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe7e13f8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fe7e235b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fe82e170e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fe8331b7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fe832f82353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdffa5ef897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdffb8c8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdffb8cda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdffb8cedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe047367e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe04c3ae609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe04c179353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdffa5ef897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdffb8c8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdffb8cda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdffb8cedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe047367e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe04c3ae609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe04c179353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdffa5ef897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fdffb552119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fe047367e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7fe04c3ae609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fe04c179353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd433967897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd434c40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd434c45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd434c46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd4806dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd485726609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd4854f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd433967897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd434c40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd434c45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd434c46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd4806dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd485726609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd4854f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd433967897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fd4348ca119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fd4806dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fd485726609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fd4854f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2dae6be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2daf997c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2daf99ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2daf99ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f2dfb436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2e0047d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2e00248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2dae6be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2daf997c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2daf99ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2daf99ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f2dfb436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2e0047d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2e00248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2dae6be897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f2daf621119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f2dfb436e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f2e0047d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f2e00248353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank18]: result = loss.backward() -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: return user_fn(self, *args) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank18]: self.grads_buffer.append(recv_grad()) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank18]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank18]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default2]:[rank18]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank18]: dist.recv( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank18]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default0]:[rank16]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: 
self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank16]: self.grads_buffer.append(recv_grad()) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank16]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank22]: self.grads_buffer.append(recv_grad()) -[default0]:[rank16]: dist.recv( 
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank22]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank16]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank22]: dist.recv( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank22]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd7a400c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd7a52e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd7a52eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd7a52ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd7f0d84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd7f5dcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd7f5b96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd7a400c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd7a52e5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd7a52eaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd7a52ebdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd7f0d84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd7f5dcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd7f5b96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd7a400c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fd7a4f6f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fd7f0d84e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fd7f5dcb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fd7f5b96353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank19]: self.grads_buffer.append(recv_grad()) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank19]: dist.recv( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb35f376897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb36064fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb360654a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb360655dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fb3ac0eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fb3b1135609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb3b0f00353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb35f376897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb36064fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb360654a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb360655dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fb3ac0eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fb3b1135609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fb3b0f00353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb35f376897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fb3602d9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fb3ac0eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fb3b1135609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fb3b0f00353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6732320897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f67335f9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f67335fea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f67335ffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f677f098e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f67840df609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f6783eaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6732320897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f67335f9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f67335fea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f67335ffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f677f098e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f67840df609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f6783eaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6732320897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f6733283119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f677f098e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f67840df609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f6783eaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9c3f035897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9c4030ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9c40313a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9c40314dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9c8bdade95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9c90df4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9c90bbf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9c3f035897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9c4030ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9c40313a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9c40314dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9c8bdade95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9c90df4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9c90bbf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9c3f035897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f9c3ff98119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f9c8bdade95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f9c90df4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f9c90bbf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600033 milliseconds before timing out.
-[default5]:[rank5]: Traceback (most recent call last):
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default5]:[rank5]: trainer.train(dataloader)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default5]:[rank5]: grad_accumulator.backward(sum(activations))
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default5]:[rank5]: result = loss.backward()
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default5]:[rank5]: torch.autograd.backward(
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default5]:[rank5]: _engine_run_backward(
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default5]:[rank5]: return user_fn(self, *args)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default5]:[rank5]: pipeline_state.run_communication()
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default5]:[rank5]: self.grads_buffer.append(recv_grad())
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank5]: dist.recv(
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank5]: return func(*args, **kwargs)
-[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600058 milliseconds before timing out.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600033 milliseconds before timing out.
-[default3]:[rank11]: Traceback (most recent call last):
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default3]:[rank11]: trainer.train(dataloader)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank11]: grad_accumulator.backward(sum(activations))
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank11]: result = loss.backward()
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank11]: torch.autograd.backward(
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank11]: _engine_run_backward(
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank11]: return user_fn(self, *args)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank11]: pipeline_state.run_communication()
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default3]:[rank11]: self.grads_buffer.append(recv_grad())
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default3]:[rank11]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank11]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank11]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank11]: dist.recv(
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank11]: return func(*args, **kwargs)
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank11]: pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600040 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600089 milliseconds before timing out.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600049 milliseconds before timing out.
-[default0]:[rank32]: Traceback (most recent call last):
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank32]: trainer.train(dataloader)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank32]: output = model(**micro_batch)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank32]: return self._call_impl(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank32]: return forward_call(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank32]: sharded_logits = self.model(
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank32]: return self._call_impl(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank32]: return forward_call(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank32]: return self._call_impl(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank32]: return forward_call(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default0]:[rank32]: pipeline_state.run_communication()
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default0]:[rank32]: recv_activation_tensor = recv_activation()
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank32]: dist.recv(
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank32]: return func(*args, **kwargs)
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank13]: self.grads_buffer.append(recv_grad()) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank13]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank13]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank13]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default5]:[rank13]: dist.recv( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank13]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fefeb638897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fefec911c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fefec916a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fefec917dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff0383b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff03d3f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff03d1c2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600012 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fefeb638897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fefec911c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fefec916a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fefec917dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff0383b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff03d3f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff03d1c2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fefeb638897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fefec59b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7ff0383b0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7ff03d3f7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7ff03d1c2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e451aa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7e46483c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7e46488a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7e46489dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7e91f22e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7e96f69609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7e96d34353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e451aa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7e46483c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7e46488a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7e46489dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7e91f22e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7e96f69609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7e96d34353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7e451aa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f7e4610d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f7e91f22e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f7e96f69609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f7e96d34353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f38c8c2f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f38c9f08c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f38c9f0da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f38c9f0edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f39159a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f391a9ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f391a7b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f38c8c2f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f38c9f08c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f38c9f0da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f38c9f0edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f39159a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f391a9ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f391a7b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f38c8c2f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f38c9b92119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f39159a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f391a9ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f391a7b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d703c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2d7169ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2d7169fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2d716a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f2dbd139e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f2dc2180609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f2dc1f4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d703c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2d7169ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2d7169fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2d716a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f2dbd139e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f2dc2180609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f2dc1f4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d703c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f2d71324119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f2dbd139e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f2dc2180609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f2dc1f4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank12]: self.grads_buffer.append(recv_grad()) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank12]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank12]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default4]:[rank12]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank12]: dist.recv( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank12]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0d53c7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0d54f53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0d54f58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0d54f59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0da09f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f0da5a39609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f0da5804353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0d53c7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0d54f53c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0d54f58a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0d54f59dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f0da09f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f0da5a39609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f0da5804353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0d53c7a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f0d54bdd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f0da09f2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f0da5a39609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f0da5804353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a955cf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4a968a8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4a968ada80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4a968aedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f4ae2347e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4ae738e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4ae7159353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a955cf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4a968a8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4a968ada80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4a968aedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f4ae2347e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4ae738e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f4ae7159353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4a955cf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f4a96532119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f4ae2347e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f4ae738e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f4ae7159353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dcde8b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0dcf164c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0dcf169a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0dcf16adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f0e1ac03e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f0e1fc4a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f0e1fa15353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dcde8b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0dcf164c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0dcf169a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0dcf16adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f0e1ac03e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f0e1fc4a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f0e1fa15353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0dcde8b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f0dcedee119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f0e1ac03e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f0e1fc4a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f0e1fa15353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc588471897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc58974ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc58974fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc589750dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc5d51e9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc5da230609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc5d9ffb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc588471897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc58974ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc58974fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc589750dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc5d51e9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc5da230609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc5d9ffb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc588471897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fc5893d4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fc5d51e9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fc5da230609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fc5d9ffb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f834fde0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f83510b9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f83510bea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f83510bfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f839cb58e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f83a1b9f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f83a196a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f834fde0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f83510b9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f83510bea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f83510bfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f839cb58e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f83a1b9f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f83a196a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f834fde0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f8350d43119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f839cb58e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f83a1b9f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f83a196a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f91d5de1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91d70bac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91d70bfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91d70c0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f9222b59e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f9227ba0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f922796b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f91d5de1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91d70bac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91d70bfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91d70c0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f9222b59e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f9227ba0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f922796b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f91d5de1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f91d6d44119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f9222b59e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f9227ba0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f922796b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7fc2acf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7fc3da8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7fc3dada80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7fc3daedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f800f847e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f801488e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f8014659353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7fc2acf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7fc3da8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7fc3dada80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7fc3daedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f800f847e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f801488e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f8014659353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7fc2acf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f7fc3a32119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f800f847e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f801488e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f8014659353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f489de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4f49cb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4f49cbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4f49cbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f4f95756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f4f9a79d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4f9a568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f489de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4f49cb7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4f49cbca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4f49cbddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f4f95756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f4f9a79d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4f9a568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4f489de897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f4f49941119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f4f95756e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f4f9a79d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f4f9a568353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f58e8316897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f58e95efc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f58e95f4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f58e95f5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f593508ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f593a0d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5939ea0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f58e8316897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f58e95efc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f58e95f4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f58e95f5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f593508ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f593a0d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5939ea0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f58e8316897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f58e9279119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f593508ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f593a0d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f5939ea0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1aed55f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1aee838c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1aee83da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1aee83edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1b3a2d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1b3f31e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1b3f0e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1aed55f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1aee838c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1aee83da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1aee83edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1b3a2d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1b3f31e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1b3f0e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1aed55f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f1aee4c2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f1b3a2d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f1b3f31e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f1b3f0e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f40f44b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f40f578dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f40f5792a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f40f5793dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f414122ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4146273609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f414603e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600077 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f40f44b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f40f578dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f40f5792a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f40f5793dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f414122ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4146273609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f414603e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f40f44b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default1]:frame #1: + 0xe32119 (0x7f40f5417119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f414122ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #3: + 0x8609 (0x7f4146273609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f414603e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f6e722897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f6f9fbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f6fa00a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f6fa01dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f3fbb49ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3fc04e1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3fc02ac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f6e722897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f6f9fbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f6fa00a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f6fa01dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f3fbb49ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3fc04e1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3fc02ac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f6e722897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f3f6f685119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f3fbb49ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f3fc04e1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f3fc02ac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f00d6658897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f00d7931c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f00d7936a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f00d7937dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f01233d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f0128417609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f01281e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f00d6658897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f00d7931c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f00d7936a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f00d7937dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f01233d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f0128417609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f01281e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f00d6658897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f00d75bb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f01233d0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f0128417609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f01281e2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3cdde02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3cdf0dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3cdf0e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3cdf0e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3d2ab7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3d2fbc1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3d2f98c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3cdde02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3cdf0dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3cdf0e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3cdf0e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3d2ab7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f3d2fbc1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3d2f98c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3cdde02897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f3cded65119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f3d2ab7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f3d2fbc1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f3d2f98c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3eaf740897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3eb0a19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3eb0a1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3eb0a1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3efc4b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3f014ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3f012ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3eaf740897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3eb0a19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3eb0a1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3eb0a1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3efc4b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3f014ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3f012ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3eaf740897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f3eb06a3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f3efc4b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f3f014ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f3f012ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5ae6dc1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5ae809ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5ae809fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5ae80a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5b33b39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5b38b80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5b3894b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5ae6dc1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5ae809ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5ae809fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5ae80a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f5b33b39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5b38b80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5b3894b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5ae6dc1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f5ae7d24119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f5b33b39e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f5b38b80609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f5b3894b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c181eb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1c194c4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1c194c9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1c194cadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: + 0xd3e95 (0x7f1c64f63e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: + 0x8609 (0x7f1c69faa609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f1c69d75353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600099 milliseconds before timing out.
[... the same checkTimeout stack trace is printed a second time for this rank, followed by a matching trace from ncclCommWatchdog (ProcessGroupNCCL.cpp:1418); only the library load addresses differ ...]
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600048 milliseconds before timing out.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8388608, NumelOut=8388608, Timeout(ms)=600000) ran for 600014 milliseconds before timing out.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600085 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=65536, NumelOut=65536, Timeout(ms)=600000) ran for 600085 milliseconds before timing out.
[... each of the ranks above also logs the ProcessGroupNCCL.cpp:1537/577/583 preamble (identical apart from the rank number), the 'terminate called after throwing an instance of c10::DistBackendError' message, and the same checkTimeout / ncclCommWatchdog stack traces shown for [default2], with only the library load addresses differing ...]
-W0703 01:34:03.899000 140284826212160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873405 closing signal SIGTERM
-W0703 01:34:03.899000 140284826212160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 873406 closing signal SIGTERM
[... the elastic agents on the other nodes log the same 'Sending process <pid> closing signal SIGTERM' warning for each of their surviving local ranks ...]
-E0703 01:34:04.013000 140303926261568 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1783593) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:34:04.055000 140692406683456 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 886866) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in 
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
[... ranks 1-7 (local_ranks 1-7) on ip-26-0-160-225.ec2.internal, pids 1783594-1783600, all exited with -6 (Signal 6, SIGABRT) at 2024-07-03_01:34:03 ...]
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2024-07-03_01:34:03
-  host      : ip-26-0-160-225.ec2.internal
-  rank      : 0 (local_rank: 0)
-  exitcode  : -6 (pid: 1783593)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 1783593
-============================================================
[... the same torchrun traceback and failure report follow for ip-26-0-171-88.ec2.internal: ranks 57-63 (local_ranks 1-7, pids 886867-886873) failed with SIGABRT; root cause rank 56 (local_rank: 0, pid: 886866) ...]
-srun: error: ip-26-0-160-225: task 0: Exited with exit code 1
-E0703 01:34:04.649000 140016183863104 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3769224) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-srun: error: ip-26-0-171-88: task 6: Exited with exit code 1
-W0703 01:34:04.769000 140016183863104 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3769151_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
[... the same RendezvousConnectionError warning is repeated several times each by the agents on ip-26-0-171-102, ip-26-0-171-62, ip-26-0-161-78, ip-26-0-161-103 and ip-26-0-161-153 ...]
[... torchrun failure report for ip-26-0-171-102.ec2.internal: ranks 41, 42, 44, 46 and 47 (local_ranks 1, 2, 4, 6, 7; pids 3769225, 3769226, 3769228, 3769230, 3769231) failed with SIGABRT; root cause rank 40 (local_rank: 0, pid: 3769224) ...]
-E0703 01:34:04.833000 139920171444032 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3897903) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
[... torchrun failure report for ip-26-0-171-62.ec2.internal: ranks 50, 51, 52 and 54 (local_ranks 2, 3, 4, 6; pids 3897905, 3897906, 3897907, 3897909) failed with SIGABRT; root cause rank 48 (local_rank: 0, pid: 3897903) ...]
-E0703 01:34:05.099000 140222101264192 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 1148722) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:34:05.117000 140284826212160 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 873402) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:34:05.128000 140380777928512 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1424624) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
[... torchrun failure report for ip-26-0-161-78.ec2.internal: ranks 37 and 38 (local_ranks 5, 6; pids 1148726, 1148727) failed with SIGABRT; root cause rank 33 (local_rank: 1, pid: 1148722) ...]
[... torchrun failure report for ip-26-0-161-103.ec2.internal: ranks 9, 10 and 14 (local_ranks 1, 2, 6; pids 873403, 873404, 873408) failed with SIGABRT; root cause rank 8 (local_rank: 0, pid: 873402) ...]
-W0703 01:34:05.187000 140380777928512 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1424550_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:34:03 - host : ip-26-0-161-153.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 1424627) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1424627 -[2]: - time : 2024-07-03_01:34:03 - host : ip-26-0-161-153.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 1424629) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1424629 -[3]: - time : 2024-07-03_01:34:03 - host : ip-26-0-161-153.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 1424631) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1424631 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:34:03 - host : ip-26-0-161-153.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 1424624) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1424624 -============================================================ -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -E0703 01:34:05.428000 139929884014400 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 669523) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 01:34:05.443000 139929884014400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_669449_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:34:05.470000 139929884014400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_669449_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:34:05.484000 139929884014400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_669449_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:34:03 - host : ip-26-0-161-138.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 669526) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 669526 -[2]: - time : 2024-07-03_01:34:03 - host : ip-26-0-161-138.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 669529) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 669529 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:34:03 - host : ip-26-0-161-138.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 669523) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 669523 -============================================================ -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 4: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
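The log above ends with torchrun's elastic agent reporting `exitcode: -6` (Signal 6, SIGABRT) for every rank on the failing nodes, after which each agent fails to shut down the c10d rendezvous. The `bench.slurm` scripts being deleted in this diff reduce such a log to a one-word `status.txt` by grepping `log.out` for a few known error strings. The sketch below is a minimal, assumed reconstruction of that classification in Python; the helper name and structure are illustrative and not part of the original tooling, only the grep patterns and status words come from the scripts themselves.

# Minimal sketch (assumed helper, not part of the repo): mirrors the grep-based
# status classification performed by the bench.slurm scripts in this diff.
from pathlib import Path

# Substrings the scripts grep for in log.out, checked in this order.
_PATTERNS = [
    ("OutOfMemoryError", "oom"),
    (" CUDA error: an illegal memory access", "oom"),  # also mapped to "oom"
    ("Timeout at NCCL", "timeout"),
]

def classify_run(log_path: str, exit_status: int) -> str:
    """Return the word that bench.slurm would write to status.txt."""
    if exit_status == 0:
        return "completed"
    text = Path(log_path).read_text(errors="ignore")
    for needle, status in _PATTERNS:
        if needle in text:
            return status
    # A crash like the SIGABRT abort above falls through to the generic bucket
    # unless one of the strings also appears earlier in the log.
    return "fail"

if __name__ == "__main__":
    print(classify_run("log.out", exit_status=1))

Note that the scripts write `oom` for the illegal-memory-access case as well, so `status.txt` alone does not distinguish a true out-of-memory failure from an illegal memory access; the `timeout` value in the status.txt deleted just below was presumably produced by the `Timeout at NCCL` branch.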
diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-16/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/bench.slurm deleted file mode 100644 index cf90c18a57940b14cd4e8819346894a3615f32ca..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/config.yaml deleted file mode 100644 index d5ec7668e9a209caa9f44ae0fd70244208c6cdf6..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 256 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 2 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out deleted file mode 100644 index a657d35d160cee9406aad5f83075d464e7ce77fe..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/log.out +++ /dev/null @@ -1,5731 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:46:39 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:46:42.935000 140525533763392 torch/distributed/run.py:757] -W0703 03:46:42.935000 140525533763392 torch/distributed/run.py:757] ***************************************** -W0703 03:46:42.935000 140525533763392 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:46:42.935000 140525533763392 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.687000 139742687819584 torch/distributed/run.py:757] -W0703 03:46:45.687000 139742687819584 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.687000 139742687819584 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:46:45.687000 139742687819584 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.733000 140168436774720 torch/distributed/run.py:757] -W0703 03:46:45.733000 140168436774720 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.733000 140168436774720 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:46:45.733000 140168436774720 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.777000 139873423791936 torch/distributed/run.py:757] -W0703 03:46:45.777000 139873423791936 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.777000 139873423791936 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:46:45.777000 139873423791936 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.780000 140682037024576 torch/distributed/run.py:757] -W0703 03:46:45.780000 140682037024576 torch/distributed/run.py:757] ***************************************** -W0703 03:46:45.780000 140682037024576 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:46:45.780000 140682037024576 torch/distributed/run.py:757] ***************************************** -W0703 03:46:46.058000 140174200911680 torch/distributed/run.py:757] -W0703 03:46:46.058000 140174200911680 torch/distributed/run.py:757] ***************************************** -W0703 03:46:46.058000 140174200911680 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:46:46.058000 140174200911680 torch/distributed/run.py:757] ***************************************** -W0703 03:46:46.135000 140031935858496 torch/distributed/run.py:757] -W0703 03:46:46.135000 140031935858496 torch/distributed/run.py:757] ***************************************** -W0703 03:46:46.135000 140031935858496 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:46:46.135000 140031935858496 torch/distributed/run.py:757] ***************************************** -W0703 03:46:46.202000 140183913641792 torch/distributed/run.py:757] -W0703 03:46:46.202000 140183913641792 torch/distributed/run.py:757] ***************************************** -W0703 03:46:46.202000 140183913641792 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:46:46.202000 140183913641792 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:47:11 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=2, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=16, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:47:11 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=2, -[default0]:07/03/2024 03:47:11 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=256, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2')), -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272) -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 03:47:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=13|ip-26-0-173-202]: No checkpoint path provided. -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=14|ip-26-0-173-202]: No checkpoint path provided. -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=15|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=8|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=10|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=9|ip-26-0-173-202]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=11|ip-26-0-173-202]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=12|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Parametrizing model parameters using StandardParametrizator -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. 
-[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=0|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=3|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=1|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=2|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: No checkpoint path provided. 
-[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=5|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=4|ip-26-0-166-125]: No checkpoint path provided. -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=6|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=1|TP=7|ip-26-0-166-125]: No checkpoint path provided. 
-[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 03:47:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=10|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=13|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=2|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=9|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=0|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=4|ip-26-0-173-246]: No checkpoint path provided. -[default3]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=11|ip-26-0-174-36]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=12|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=8|ip-26-0-174-36]: No checkpoint path provided. -[default3]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=3|ip-26-0-173-246]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=1|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=15|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=14|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=5|ip-26-0-173-246]: No checkpoint path provided. -[default6]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=6|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 03:47:29 [INFO|DP=1|PP=1|TP=7|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=2|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=5|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=3|ip-26-0-164-207]: No checkpoint path provided. -[default4]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=4|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=6|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=0|ip-26-0-164-207]: No checkpoint path provided. -[default1]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=1|ip-26-0-164-207]: No checkpoint path provided. 
-[default7]:07/03/2024 03:47:29 [INFO|DP=1|PP=0|TP=7|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 03:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 03:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 03:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 03:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 03:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 03:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Using `datasets` library -[default0]:07/03/2024 03:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 03:47:32 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 03:47:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 03:47:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: -[default0]:07/03/2024 03:47:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Start training] datetime: 2024-07-03 03:47:34.680247 | mbs: 2 | grad_accum: 256 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 03:47:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 03:47:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default5]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=13|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=14|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=15|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=8|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=12|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=9|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=2|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=10|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=9|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=12|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=4|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=0|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=1|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=1|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=11|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=3|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=9|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=2|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=14|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=4|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=5|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=15|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=7|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=6|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=10|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=7|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=5|ip-26-0-164-207]: Repo card metadata block was not found. 
Setting CardData to empty. -[default3]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=3|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=2|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=6|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=0|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=15|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=7|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=10|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=11|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=13|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=11|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=8|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:07/03/2024 03:47:34 [WARNING|DP=0|PP=0|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:47:34 [WARNING|DP=0|PP=1|TP=0|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=5|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:34 [WARNING|DP=1|PP=1|TP=14|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=4|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:47:34 [WARNING|DP=1|PP=0|TP=1|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:47:35 [WARNING|DP=1|PP=1|TP=3|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:47:35 [WARNING|DP=0|PP=1|TP=6|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. 
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank14]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank14]: grad_accumulator.backward(sum(activations)) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank14]: result = loss.backward() -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank14]: torch.autograd.backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank14]: _engine_run_backward( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank14]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank14]: return user_fn(self, *args) -[default6]:[rank14]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank14]: pipeline_state.run_communication() -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank14]: self.grads_buffer.append(recv_grad()) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank14]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank14]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank14]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank14]: dist.recv( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank14]: return func(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank14]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank5]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default5]:[rank5]: dist.recv( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank1]: self.grads_buffer.append(recv_grad()) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank1]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank1]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank1]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank1]: dist.recv( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank1]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward 
-[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank2]: self.grads_buffer.append(recv_grad()) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank2]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank2]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank2]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank2]: dist.recv( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank2]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
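Every backward-pass traceback above stops in the same frame: the pipeline-parallel receive path reaches _recv_meta, which does a blocking dist.recv for a small metadata tensor before the gradient payload, and that receive is what the NCCL watchdog eventually aborts once the matching send never arrives. The following is a minimal sketch of that metadata-then-payload pattern using plain torch.distributed; the shapes, dtype, tag, metadata layout, and helper name are illustrative assumptions, not nanotron's actual p2p.py code, and it presumes an already-initialized NCCL process group.

    # Hedged sketch of a metadata-then-payload point-to-point receive.
    import torch
    import torch.distributed as dist

    def recv_tensor_from(from_rank: int, tag: int = 0) -> torch.Tensor:
        # 1) Receive a small metadata tensor first (assumed layout: ndim followed by dims).
        meta = torch.empty(8, dtype=torch.int64, device="cuda")
        dist.recv(meta, src=from_rank, tag=tag)  # blocks here; the watchdog aborts it after the timeout
        ndim = int(meta[0])
        shape = [int(d) for d in meta[1 : 1 + ndim]]

        # 2) Allocate a buffer of the advertised shape and receive the payload asynchronously.
        buffer = torch.empty(shape, dtype=torch.bfloat16, device="cuda")
        work = dist.irecv(buffer, src=from_rank, tag=tag)
        work.wait()
        return buffer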
-[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
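The forward-path failures (for example rank 43 above) hang in the same _recv_meta call while waiting for activations rather than gradients, and the Timeout(ms)=600000 in every watchdog message corresponds to the process-group timeout. If the stall were a transient slowdown rather than a genuine pipeline deadlock, that timeout could in principle be raised at initialization; the snippet below is only an illustration of the knob involved, not the configuration this run actually used.

    # Hedged sketch: raise the collective timeout (and thus the watchdog threshold)
    # beyond the roughly 10-minute value seen in the log above.
    from datetime import timedelta
    import torch.distributed as dist

    # Assumes rank and world size come from the usual torchrun environment variables.
    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))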
-[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: Traceback (most recent call last): -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank8]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank8]: grad_accumulator.backward(sum(activations)) -[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank11]: grad_accumulator.backward(sum(activations)) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank8]: result = loss.backward() -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank8]: torch.autograd.backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank8]: _engine_run_backward( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank8]: return user_fn(self, *args) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank8]: pipeline_state.run_communication() 
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank11]: result = loss.backward() -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank11]: torch.autograd.backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank11]: _engine_run_backward( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: self.grads_buffer.append(recv_grad()) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank8]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank8]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank8]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank11]: return user_fn(self, *args) -[default0]:[rank8]: dist.recv( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank8]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank11]: pipeline_state.run_communication() -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank11]: self.grads_buffer.append(recv_grad()) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank11]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank11]: buffers, futures = 
self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank11]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank11]: dist.recv( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank11]: return func(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank8]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank11]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank10]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank10]: grad_accumulator.backward(sum(activations)) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank10]: result = loss.backward() -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank10]: torch.autograd.backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank10]: _engine_run_backward( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank10]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank10]: return user_fn(self, *args) -[default2]:[rank10]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank10]: pipeline_state.run_communication() -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank10]: self.grads_buffer.append(recv_grad()) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank10]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank10]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank10]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank10]: dist.recv( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank10]: return func(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank10]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank3]: self.grads_buffer.append(recv_grad()) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank3]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank3]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank3]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default3]:[rank3]: dist.recv( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank3]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank23]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank23]: grad_accumulator.backward(sum(activations)) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank23]: result = loss.backward() -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank23]: torch.autograd.backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank23]: _engine_run_backward( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank23]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank23]: return user_fn(self, *args) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank23]: pipeline_state.run_communication() -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank23]: self.grads_buffer.append(recv_grad()) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank23]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank23]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank23]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default7]:[rank23]: dist.recv( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank23]: return func(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank23]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank18]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank18]: grad_accumulator.backward(sum(activations)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank18]: result = loss.backward() -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank18]: torch.autograd.backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank18]: _engine_run_backward( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank18]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank18]: return user_fn(self, *args) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank18]: pipeline_state.run_communication() -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank18]: self.grads_buffer.append(recv_grad()) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank18]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank18]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank18]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank18]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank18]: dist.recv( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank18]: return func(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank18]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
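Every aborted rank above, whether it is waiting for activations in the forward pass (recv_activation() reached via block.py) or for gradients in the backward pass (recv_grad() reached via functional.py), bottoms out in the same call: P2P._recv_meta issues a blocking dist.recv() for a small metadata message that precedes the actual tensor, and that receive never completes once the peer stage has gone down. The sketch below is illustrative only; the helper names, the fixed-size metadata layout and the float32 payload are assumptions for the sketch, not nanotron's actual wire format.

# minimal sketch of a metadata-then-payload P2P exchange; assumes
# dist.init_process_group("nccl") has already been called and the tensors
# live on each rank's own GPU
import torch
import torch.distributed as dist

MAX_DIMS = 8  # assumption: metadata is a fixed-size message, zero-padded

def send_tensor(t, dst):
    # 1) send the metadata first: [ndim, shape...]
    meta = torch.zeros(MAX_DIMS + 1, dtype=torch.long, device=t.device)
    meta[0] = t.dim()
    meta[1 : 1 + t.dim()] = torch.tensor(t.shape, device=t.device)
    dist.send(meta, dst=dst)
    # 2) then send the payload itself
    dist.send(t.contiguous(), dst=dst)

def recv_tensor(src, device):
    # 1) blocking receive of the metadata; this is the dist.recv() every
    #    traceback above is stuck in: if the matching send never happens,
    #    the call sits here until the NCCL watchdog aborts the communicator
    meta = torch.zeros(MAX_DIMS + 1, dtype=torch.long, device=device)
    dist.recv(meta, src=src)
    ndim = int(meta[0])
    shape = [int(s) for s in meta[1 : 1 + ndim]]
    # 2) allocate a buffer of the advertised shape and receive the payload
    #    (float32 assumed here; a real protocol would also ship the dtype)
    buf = torch.empty(shape, dtype=torch.float32, device=device)
    dist.recv(buf, src=src)
    return buf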
-[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank9]: self.grads_buffer.append(recv_grad()) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank9]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank9]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank9]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default1]:[rank9]: dist.recv( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank9]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5d40c49897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5d41f22c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5d41f27a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5d41f28dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5d8d9c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5d92a08609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f5d927d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5d40c49897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5d41f22c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5d41f27a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5d41f28dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5d8d9c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5d92a08609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f5d927d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5d40c49897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f5d41bac119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f5d8d9c1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f5d92a08609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f5d927d3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
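The root event is visible in rank 14's watchdog output above: a pipeline SEND (WorkNCCL SeqNum=15) ran for 600042 ms against a 600000 ms window, the watchdog tore that process down "to avoid data inconsistency", and every peer still blocked in the matching receive then reported that its NCCL communicator was aborted. The 600000 ms figure is simply the process-group timeout (PyTorch's default for NCCL is 10 minutes). As an illustration only, not something the benchmark scripts themselves do, that window is set when the process group is created, so a shorter value makes this kind of hang fail fast instead of stalling the whole allocation.

# minimal sketch, assuming a torchrun-style launch that provides RANK,
# WORLD_SIZE, LOCAL_RANK, MASTER_ADDR and MASTER_PORT in the environment
import datetime
import os

import torch
import torch.distributed as dist

torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
dist.init_process_group(
    backend="nccl",
    # watchdog window for collectives; the default (about 10 minutes for
    # NCCL) is what produced the Timeout(ms)=600000 entries in this log
    timeout=datetime.timedelta(minutes=2),
)
# ... training loop would go here ...
dist.destroy_process_group()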
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank27]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank27]: grad_accumulator.backward(sum(activations)) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank27]: result = loss.backward() -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank27]: torch.autograd.backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank27]: _engine_run_backward( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank27]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank27]: return user_fn(self, *args) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank27]: pipeline_state.run_communication() -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank27]: self.grads_buffer.append(recv_grad()) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank27]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank27]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank27]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank27]: dist.recv( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank27]: return func(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank27]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank31]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank31]: grad_accumulator.backward(sum(activations)) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank31]: result = loss.backward() -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank31]: torch.autograd.backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank31]: _engine_run_backward( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank31]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank31]: return user_fn(self, *args) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank31]: pipeline_state.run_communication() -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank31]: self.grads_buffer.append(recv_grad()) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank31]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank31]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank31]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default7]:[rank31]: dist.recv( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank31]: return func(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank31]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) 
-[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank63]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
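When a configuration hangs like this, the stock log only says which rank's watchdog fired first; asking c10d and NCCL for more detail before the process group is created makes the next run point at the desynchronized collective directly. A small sketch follows, with the caveat that the exact environment-variable spellings depend on the PyTorch/NCCL build in the conda env (older builds use NCCL_DESYNC_DEBUG rather than the TORCH_NCCL_ prefix), so treat the names as assumptions to verify against the installed version.

# illustrative only: these must be set before dist.init_process_group()
# (or exported in the launcher environment) to take effect
import os

os.environ.setdefault("NCCL_DEBUG", "INFO")                 # NCCL transport/topology logging
os.environ.setdefault("TORCH_DISTRIBUTED_DEBUG", "DETAIL")  # extra c10d consistency checks
os.environ.setdefault("TORCH_NCCL_DESYNC_DEBUG", "1")       # report the desynced collective on timeout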
-[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer
-[default1]:[rank57]: pipeline_state.run_communication()
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank57]: recv_activation_tensor = recv_activation()
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank57]: dist.recv(
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank57]: return func(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... ranks 20 and 22 fail with the same backward-path traceback as rank 21; only the rank 21 copy is kept ...]
-[default5]:[rank21]: Traceback (most recent call last):
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default5]:[rank21]: trainer.train(dataloader)
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default5]:[rank21]: grad_accumulator.backward(sum(activations))
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default5]:[rank21]: result = loss.backward()
-[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default5]:[rank21]: torch.autograd.backward(
-[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default5]:[rank21]: _engine_run_backward(
-[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default5]:[rank21]: return user_fn(self, *args)
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default5]:[rank21]: pipeline_state.run_communication()
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default5]:[rank21]: self.grads_buffer.append(recv_grad())
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default5]:[rank21]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
[... the rest of the rank 21 receive path is identical to the rank 57 frames above (recv_tensors -> irecv_tensors -> _recv_meta -> dist.recv) ...]
-[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f011c603897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f011d8dcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f011d8e1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f011d8e2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f016937be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f016e3c2609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f016e18d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600059 milliseconds before timing out.
[... rank 5 then prints the same checkTimeout stack a second time, followed by an "Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418" stack that ends in clone ...]
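The watchdog block above is the NCCL-side view of the hang: a point-to-point SEND (WorkNCCL SeqNum=15) stayed pending for the full Timeout(ms)=600000 allowed for this process group, after which the watchdog aborted the communicator and took the process down, producing the `NCCL communicator was aborted` errors in the Python tracebacks. That bound is the timeout attached to the process group at initialization. The snippet below is only a generic, hedged sketch of how that knob is passed to `torch.distributed.init_process_group`; it is not nanotron's or this benchmark's actual initialization code, and whether the run exposes such a setting in its config is an assumption.

```python
# Hedged sketch: setting the per-process-group timeout that shows up as
# "Timeout(ms)=600000" in the watchdog message. Illustrative only, not the
# benchmark's real init path.
import datetime
import os

import torch.distributed as dist


def init_process_group_with_timeout(minutes: int = 10) -> None:
    # RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT are assumed to be provided
    # by the launcher (torchrun exports them for each worker).
    dist.init_process_group(
        backend="nccl",
        timeout=datetime.timedelta(minutes=minutes),
        rank=int(os.environ["RANK"]),
        world_size=int(os.environ["WORLD_SIZE"]),
    )


if __name__ == "__main__":
    init_process_group_with_timeout(minutes=10)  # 10 min == the 600000 ms seen above
```

Raising the bound only helps when the peer is slow rather than gone; in this log the peer stage had itself stopped making progress, so the timeout is what turned a silent hang into the explicit "Timeout at NCCL work" error.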
[... ranks 41, 59 and 46 fail on the forward path while waiting for activations; only the rank 41 copy is kept ...]
-[default1]:[rank41]: Traceback (most recent call last):
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank41]: trainer.train(dataloader)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank41]: output = model(**micro_batch)
[... torch.nn.modules.module._wrapped_call_impl/_call_impl frames between each module call are elided ...]
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank41]: sharded_logits = self.model(
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank41]: pipeline_state.run_communication()
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank41]: recv_activation_tensor = recv_activation()
[... the rest of the rank 41 receive path is identical to the rank 57 frames above (recv_activation -> recv_tensors -> irecv_tensors -> _recv_meta -> dist.recv) ...]
-[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... rank 43 prints the same ProcessGroupNCCL watchdog sequence as rank 5 above (addresses differ); its timeout line: ...]
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
[... rank 28 fails with the same backward-path traceback as rank 21 above ...]
-[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[... rank 1 prints the same watchdog sequence as rank 5 above; its timeout line: ...]
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600043 milliseconds before timing out.
[... rank 59 fails with the same forward-path traceback as rank 41 above ...]
-[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... ranks 36 and 33 print the same watchdog sequence as rank 5 above; their timeout lines: ...]
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600006 milliseconds before timing out.
[... rank 46 fails with the same forward-path traceback as rank 41 above ...]
-[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... rank 25 fails with the same backward-path traceback as rank 21 above ...]
-[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[... ranks 0 and 6 then begin the same backward-path traceback (train -> training_step -> train_batch_iter -> backward -> grad_accumulator.backward -> loss.backward -> recv_grad -> p2p.recv_tensors), interleaved with each other in the raw log, which continues with ...]
-[default6]:[rank6]: meta 
= self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank0]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank0]: dist.recv( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank6]: dist.recv( -[default0]:[rank0]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17d8875897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f17d9b4ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f17d9b53a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f17d9b54dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: <unknown function> + 0xd3e95 (0x7f18255ede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: <unknown function> + 0x8609 (0x7f182a634609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f182a3ff353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default2]:  what():  [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out.
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17d8875897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f17d9b4ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f17d9b53a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f17d9b54dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #4: <unknown function> + 0xd3e95 (0x7f18255ede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #5: <unknown function> + 0x8609 (0x7f182a634609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #6: clone + 0x43 (0x7f182a3ff353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
-[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f17d8875897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default2]:frame #1: <unknown function> + 0xe32119 (0x7f17d97d8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default2]:frame #2: <unknown function> + 0xd3e95 (0x7f18255ede95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default2]:frame #3: <unknown function> + 0x8609 (0x7f182a634609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default2]:frame #4: clone + 0x43 (0x7f182a3ff353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default2]:
[... rank 30 (default6) printed the same backward-pass traceback as rank 0, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0." ...]
[... ranks 8 (default0) and 11 (default3) reported the same ProcessGroupNCCL watchdog timeout, WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000), measured at 600033 ms and 600053 ms respectively, and were taken down with the same c10::DistBackendError backtrace as rank 2 ...]
-[default4]:[rank52]: Traceback (most recent call last):
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank52]:     trainer.train(dataloader)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank52]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank52]:     outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default4]:[rank52]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default4]:[rank52]:     output = model(**micro_batch)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]:     return self._call_impl(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]:     return forward_call(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default4]:[rank52]:     sharded_logits = self.model(
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]:     return self._call_impl(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]:     return forward_call(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default4]:[rank52]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank52]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank52]:     return self._call_impl(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank52]:     return forward_call(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default4]:[rank52]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default4]:[rank52]:     pipeline_state.run_communication()
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default4]:[rank52]:     recv_activation_tensor = recv_activation()
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default4]:[rank52]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank52]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank52]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank52]:     dist.recv(
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank52]:     return func(*args, **kwargs)
-[default4]:[rank52]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank52]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
[... rank 50 (default2) printed an identical forward-pass traceback (activation recv blocked in p2p.py:269 _recv_meta, then dist.recv), also ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1." ...]
[... ranks 4, 12, and 15 printed the same backward-pass traceback as rank 0 (grad recv blocked in p2p.py:269 _recv_meta, then dist.recv), ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0." ...]
[... ranks 3, 7, 9, and 10 reported the same ProcessGroupNCCL watchdog timeout as rank 2, WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000), measured between 600052 ms and 600058 ms, and terminated with the same c10::DistBackendError backtrace ...]
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f52c95ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f52ca885c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f52ca88aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f52ca88bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5316324e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f531b36b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f531b136353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f52c95ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f52ca50f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f5316324e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f531b36b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f531b136353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank13]: self.grads_buffer.append(recv_grad()) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank13]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank13]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank13]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank13]: dist.recv( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank13]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
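Every watchdog message above reports the same failure: a pipeline-parallel point-to-point transfer (OpType=SEND, SeqNum=15) never completes, so once the configured collective timeout (600 000 ms here) elapses the NCCL watchdog aborts the communicator and tears the whole process down, which is why the ranks then die with c10::DistBackendError. As a purely illustrative sketch, not part of the deleted scripts, this is roughly how that timeout is set on the PyTorch side; the call is standard torch.distributed API, but the 30-minute value is only an example:

    # Hedged sketch: the collective timeout enforced by the NCCL watchdog
    # (the "Timeout(ms)=600000" seen in the log above). The value is an example.
    import datetime
    import torch.distributed as dist

    dist.init_process_group(
        backend="nccl",
        # Raising the timeout only helps if the pipeline is merely slow; a peer
        # that never posts the matching send will still hang, just for longer.
        timeout=datetime.timedelta(minutes=30),
    )

A larger timeout masks slow-but-otherwise-correct runs; here the senders' SEND work and the receivers' _recv_meta() calls appear to be waiting on different messages, so the more useful signal is which rank is missing from the stuck send/recv pair. Depending on the PyTorch build, the async-error-handling setting (TORCH_NCCL_ASYNC_ERROR_HANDLING / NCCL_ASYNC_ERROR_HANDLING) controls whether the watchdog takes the process down as seen here.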
-[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f383e98b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f383fc64c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f383fc69a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f383fc6adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f388b703e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f389074a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3890515353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f383e98b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f383fc64c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f383fc69a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f383fc6adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f388b703e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f389074a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f3890515353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f383e98b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f383f8ee119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f388b703e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f389074a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f3890515353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default0]:[rank32]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: output = model(**micro_batch) -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: pipeline_state.run_communication() -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: return self._call_impl(*args, **kwargs) 
-[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: dist.recv( -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
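Each of the forward-stage tracebacks above stops in the same frame: nanotron's p2p._recv_meta() issuing a blocking dist.recv() for activation metadata that the upstream stage never sends, so the consumer ranks sit in that receive until the watchdog abort shown earlier kills them. The following self-contained sketch reproduces that hang pattern in isolation; it is illustrative only (not nanotron or bench_cluster code), and the script name, tensor size, and 30-second timeout are assumptions chosen so the watchdog fires quickly:

    # Hedged repro sketch of a point-to-point recv whose matching send never arrives.
    # Run with: torchrun --nproc_per_node=2 p2p_hang_repro.py
    # Requires two CUDA GPUs on one node (NCCL backend).
    import datetime
    import os
    import time

    import torch
    import torch.distributed as dist

    def main():
        local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun
        torch.cuda.set_device(local_rank)
        dist.init_process_group(
            backend="nccl",
            timeout=datetime.timedelta(seconds=30),  # short so the watchdog fires fast
        )
        # Establish the default NCCL communicator with a collective first, so the
        # later point-to-point recv is enqueued on an already-initialized comm.
        dist.barrier()
        if dist.get_rank() == 1:
            buf = torch.empty(8192, device="cuda")
            # Mirrors the _recv_meta() pattern in the tracebacks: the matching send
            # never arrives, so the NCCL watchdog times this work out and tears the
            # process down, as in the log above.
            dist.recv(buf, src=0)
            torch.cuda.synchronize()
        else:
            # Rank 0 deliberately never sends, emulating a stalled upstream stage.
            time.sleep(120)
        dist.destroy_process_group()

    if __name__ == "__main__":
        main()

In the full run the same mechanism plays out across pipeline stages: the consumer ranks block in the activation/metadata receive while their producers' SEND work (the WorkNCCL entries above) never completes, and the watchdog aborts every rank in the affected process group.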
-[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f21ac498897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f21ad771c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f21ad776a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f21ad777dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f21f9210e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f21fe257609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f21fe022353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f21ac498897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f21ad771c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f21ad776a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f21ad777dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f21f9210e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f21fe257609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f21fe022353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f21ac498897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f21ad3fb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f21f9210e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f21fe257609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f21fe022353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank24]: self.grads_buffer.append(recv_grad()) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank24]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank24]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default0]:[rank24]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank24]: dist.recv( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank24]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3aa69b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3aa7c90c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3aa7c95a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3aa7c96dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3af372fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3af8776609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3af8541353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600052 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3aa69b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3aa7c90c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3aa7c95a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3aa7c96dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f3af372fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f3af8776609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f3af8541353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3aa69b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f3aa791a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f3af372fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f3af8776609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f3af8541353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank16]: self.grads_buffer.append(recv_grad()) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank16]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank16]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default0]:[rank16]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank16]: dist.recv( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank16]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e1d750897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3e1ea29c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3e1ea2ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3e1ea2fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f3e6a4c8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3e6f50f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3e6f2da353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e1d750897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3e1ea29c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3e1ea2ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3e1ea2fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f3e6a4c8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f3e6f50f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f3e6f2da353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3e1d750897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f3e1e6b3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f3e6a4c8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f3e6f50f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f3e6f2da353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff9c8e7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff9ca155c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff9ca15aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff9ca15bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ffa15bf4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ffa1ac3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ffa1aa06353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff9c8e7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff9ca155c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff9ca15aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff9ca15bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ffa15bf4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ffa1ac3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ffa1aa06353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff9c8e7c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7ff9c9ddf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7ffa15bf4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7ffa1ac3b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7ffa1aa06353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4179c6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb418c9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb418ca4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb418ca5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fb46473ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fb469785609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fb469550353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4179c6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb418c9fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb418ca4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb418ca5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fb46473ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fb469785609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fb469550353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4179c6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fb418929119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fb46473ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fb469785609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fb469550353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4a949a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa4aa773c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa4aa778a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa4aa779dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa4f6212e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa4fb259609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa4fb024353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4a949a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa4aa773c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa4aa778a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa4aa779dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa4f6212e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa4fb259609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa4fb024353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4a949a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fa4aa3fd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fa4f6212e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fa4fb259609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fa4fb024353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70e6ce0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70e7fb9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70e7fbea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70e7fbfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7133a58e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7138a9f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f713886a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70e6ce0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f70e7fb9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f70e7fbea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f70e7fbfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f7133a58e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7138a9f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f713886a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f70e6ce0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f70e7c43119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f7133a58e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f7138a9f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f713886a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa56a16897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffa57cefc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffa57cf4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffa57cf5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ffaa378ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ffaa87d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ffaa85a0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa56a16897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffa57cefc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffa57cf4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffa57cf5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ffaa378ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ffaa87d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ffaa85a0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa56a16897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ffa57979119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ffaa378ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7ffaa87d5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ffaa85a0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f313c59a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f313d873c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f313d878a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f313d879dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3189312e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f318e359609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f318e124353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f313c59a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f313d873c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f313d878a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f313d879dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f3189312e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f318e359609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f318e124353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f313c59a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f313d4fd119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f3189312e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f318e359609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f318e124353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( 
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default4]:[rank60]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in 
recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f918a5e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f918b8bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f918b8c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f918b8c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f91d735ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f91dc3a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f91dc170353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f918a5e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f918b8bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f918b8c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f918b8c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f91d735ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f91dc3a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f91dc170353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f918a5e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f918b549119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f91d735ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f91dc3a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f91dc170353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6478111897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f64793eac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f64793efa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f64793f0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f64c4e89e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f64c9ed0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f64c9c9b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6478111897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f64793eac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f64793efa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f64793f0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f64c4e89e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f64c9ed0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f64c9c9b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6478111897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f6479074119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f64c4e89e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f64c9ed0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f64c9c9b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6e63d24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6e64ffdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6e65002a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6e65003dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6eb0a9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6eb5ae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f6eb58ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6e63d24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6e64ffdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6e65002a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6e65003dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f6eb0a9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f6eb5ae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f6eb58ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6e63d24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f6e64c87119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last 
completed NCCL work: 14. -[default6]:frame #2: + 0xd3e95 (0x7f6eb0a9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f6eb5ae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f6eb58ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc461d15897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc462feec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc462ff3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc462ff4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc4aea8de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc4b3ad4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc4b389f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc461d15897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc462feec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc462ff3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc462ff4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc4aea8de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc4b3ad4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc4b389f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc461d15897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fc462c78119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fc4aea8de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fc4b3ad4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fc4b389f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f232168a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2322963c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2322968a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2322969dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f236e402e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2373449609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2373214353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f232168a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2322963c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2322968a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2322969dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f236e402e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2373449609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2373214353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f232168a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f23225ed119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f236e402e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f2373449609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f2373214353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd124cd6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd125fafc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd125fb4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd125fb5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd171a4ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd176a95609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd176860353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd124cd6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd125fafc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd125fb4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd125fb5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd171a4ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd176a95609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd176860353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd124cd6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fd125c39119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fd171a4ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fd176a95609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fd176860353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f864b0b6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f864c38fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f864c394a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f864c395dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f8697e2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f869ce75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f869cc40353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f864b0b6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f864c38fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f864c394a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f864c395dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f8697e2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f869ce75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f869cc40353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f864b0b6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f864c019119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f8697e2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f869ce75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f869cc40353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac13249897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fac14522c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fac14527a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fac14528dcc in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank19]: self.grads_buffer.append(recv_grad()) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank19]: dist.recv( -[default2]:frame #4: + 0xd3e95 (0x7fac5ffc1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:frame #5: + 0x8609 (0x7fac65008609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fac64dd3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac13249897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fac14522c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fac14527a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fac14528dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fac5ffc1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e26478897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e27751c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e27756a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #5: + 0x8609 (0x7fac65008609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e27757dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f2e731f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f2e78237609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f2e78002353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, 
Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e26478897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e27751c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e27756a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e27757dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f2e731f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #6: clone + 0x43 (0x7fac64dd3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #5: + 0x8609 (0x7f2e78237609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f2e78002353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac13249897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e26478897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f2e273db119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f2e731f0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f2e78237609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f2e78002353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:frame #1: + 0xe32119 (0x7fac141ac119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fac5ffc1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fac65008609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fac64dd3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7ccbcfe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7cccfd7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7cccfdca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7cccfdddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7d18a76e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f7d1dabd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f7d1d888353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7ccbcfe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7cccfd7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7cccfdca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7cccfdddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7d18a76e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #5: + 0x8609 (0x7f7d1dabd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f7d1d888353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff41a66c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff41b945c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff41b94aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff41b94bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7ccbcfe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f7cccc61119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f7d18a76e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #4: + 0xd3e95 (0x7ff4673e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:frame #3: + 0x8609 (0x7f7d1dabd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default4]:frame #5: + 0x8609 (0x7ff46c42b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff46c1f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #4: clone + 0x43 (0x7f7d1d888353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff41a66c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff41b945c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbdd4a48897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff41b94aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff41b94bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff4673e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff46c42b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff46c1f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff41a66c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ff41b5cf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbdd5d21c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff4673e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7ff46c42b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ff46c1f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbdd5d26a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]: -[default5]:frame #3: 
c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbdd5d27dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fbe217c0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fbe26807609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fbe265d2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbdd4a48897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbdd5d21c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbdd5d26a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbdd5d27dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fbe217c0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fbe26807609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fbe265d2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbdd4a48897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fbdd59ab119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fbe217c0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fbe26807609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fbe265d2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa309b62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa30ae3bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa30ae40a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa30ae41dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa3568dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa35b921609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa35b6ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa309b62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa30ae3bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa30ae40a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa30ae41dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fa3568dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fa35b921609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fa35b6ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa309b62897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fa30aac5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fa3568dae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fa35b921609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fa35b6ec353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f575eace897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f575fda7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f575fdaca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f575fdaddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f57ab846e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f57b088d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f57b0658353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f575eace897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f575fda7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f575fdaca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f575fdaddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f57ab846e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f57b088d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f57b0658353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f575eace897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f575fa31119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f57ab846e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f57b088d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f57b0658353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank29]: grad_accumulator.backward(sum(activations)) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank29]: result = loss.backward() -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank29]: torch.autograd.backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank29]: _engine_run_backward( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank29]: return user_fn(self, *args) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank29]: pipeline_state.run_communication() -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank29]: self.grads_buffer.append(recv_grad()) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank29]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank29]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default5]:[rank29]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank29]: dist.recv( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank29]: return func(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank29]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f6e5a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f6f87ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f6f883a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f6f884dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f2fbb31de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2fc0364609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2fc012f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f6e5a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2f6f87ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2f6f883a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2f6f884dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f2fbb31de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f2fc0364609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f2fc012f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2f6e5a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f2f6f508119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f2fbb31de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f2fc0364609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f2fc012f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f081f040897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0820319c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f082031ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f082031fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f086bdb8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f0870dff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f0870bca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f081f040897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0820319c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f082031ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f082031fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f086bdb8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f0870dff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f0870bca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f081f040897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f081ffa3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f086bdb8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f0870dff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f0870bca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe6fbd1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe6fcff6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe6fcffba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe6fcffcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe748a95e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe74dadc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe74d8a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe6fbd1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe6fcff6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe6fcffba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe6fcffcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe748a95e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe74dadc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe74d8a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe6fbd1d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fe6fcc80119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fe748a95e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fe74dadc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fe74d8a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd6f54b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd6f6790c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd6f6795a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd6f6796dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fd74222fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fd747276609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fd747041353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd6f54b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd6f6790c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd6f6795a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd6f6796dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fd74222fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fd747276609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fd747041353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd6f54b7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fd6f641a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fd74222fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fd747276609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fd747041353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c3ac8b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c3bf64c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c3bf69a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c3bf6adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7c87a03e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7c8ca4a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7c8c815353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c3ac8b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7c3bf64c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7c3bf69a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7c3bf6adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7c87a03e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7c8ca4a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7c8c815353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7c3ac8b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f7c3bbee119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f7c87a03e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f7c8ca4a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f7c8c815353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: output = 
model(**micro_batch) -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: sharded_logits = self.model( -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = 
recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank51]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f37d0990897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f37d1c69c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f37d1c6ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f37d1c6fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f381d708e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f382274f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f382251a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f37d0990897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f37d1c69c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f37d1c6ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f37d1c6fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f381d708e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f382274f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f382251a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f37d0990897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f37d18f3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f381d708e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f382274f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f382251a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4288c0d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4289ee6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4289eeba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4289eecdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f42d5985e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f42da9cc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f42da797353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4288c0d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4289ee6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4289eeba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4289eecdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f42d5985e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f42da9cc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f42da797353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4288c0d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f4289b70119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f42d5985e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f42da9cc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f42da797353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feba0089897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feba1362c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feba1367a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feba1368dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7febece01e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7febf1e48609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7febf1c13353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feba0089897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feba1362c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feba1367a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feba1368dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7febece01e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7febf1e48609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7febf1c13353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7feba0089897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7feba0fec119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7febece01e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7febf1e48609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7febf1c13353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff6906b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff69198dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff691992a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff691993dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff6dd42ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff6e2473609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff6e223e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff6906b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff69198dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff691992a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff691993dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ff6dd42ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ff6e2473609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ff6e223e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff6906b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7ff691617119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7ff6dd42ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7ff6e2473609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7ff6e223e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84225df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84238b8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84238bda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84238bedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f846f357e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f847439e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8474169353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84225df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f84238b8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f84238bda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f84238bedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f846f357e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f847439e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8474169353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f84225df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f8423542119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f846f357e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f847439e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f8474169353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc847569897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc848842c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc848847a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc848848dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc8942e1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc899328609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc8990f3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc847569897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc848842c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc848847a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc848848dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fc8942e1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fc899328609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fc8990f3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc847569897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fc8484cc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fc8942e1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fc899328609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fc8990f3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b3eb0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b3fde3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b3fde8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b3fde9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f1b8b882e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f1b908c9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f1b90694353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b3eb0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b3fde3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b3fde8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b3fde9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f1b8b882e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f1b908c9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f1b90694353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b3eb0a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f1b3fa6d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f1b8b882e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f1b908c9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f1b90694353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank53]: dist.recv( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd30c9f3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd30dcccc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd30dcd1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd30dcd2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fd35976be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd35e7b2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fd35e57d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd30c9f3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd30dcccc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd30dcd1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd30dcd2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fd35976be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd35e7b2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fd35e57d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd30c9f3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fd30d956119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fd35976be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fd35e7b2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fd35e57d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0264613897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02658ecc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02658f1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02658f2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f02b138be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f02b63d2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f02b619d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0264613897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02658ecc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02658f1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02658f2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f02b138be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f02b63d2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f02b619d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0264613897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f0265576119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f02b138be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f02b63d2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f02b619d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea06069897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea07342c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea07347a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea07348dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fea52de1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fea57e28609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fea57bf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea06069897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fea07342c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fea07347a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fea07348dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fea52de1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fea57e28609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fea57bf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea06069897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fea06fcc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fea52de1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fea57e28609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fea57bf3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98b61ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f98b7484c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f98b7489a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f98b748adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9902f23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9907f6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9907d35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98b61ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f98b7484c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f98b7489a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f98b748adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f9902f23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f9907f6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f9907d35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f98b61ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f98b710e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f9902f23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f9907f6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f9907d35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0b187ae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0b19a87c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0b19a8ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0b19a8ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f0b65526e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f0b6a56d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f0b6a338353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0b187ae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0b19a87c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0b19a8ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0b19a8ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f0b65526e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f0b6a56d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f0b6a338353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0b187ae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f0b19711119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f0b65526e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f0b6a56d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f0b6a338353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f56e484a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f56e5b23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f56e5b28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f56e5b29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f57315c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5736609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f57363d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f56e484a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f56e5b23c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f56e5b28a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f56e5b29dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f57315c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5736609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f57363d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f56e484a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f56e57ad119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f57315c2e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f5736609609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f57363d4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1eee511897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1eef7eac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1eef7efa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1eef7f0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1f3b289e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1f402d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1f4009b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1eee511897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1eef7eac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1eef7efa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1eef7f0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1f3b289e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1f402d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1f4009b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1eee511897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f1eef474119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f1f3b289e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f1f402d0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f1f4009b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fabb9172897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fabba44bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fabba450a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fabba451dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fac05eeae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fac0af31609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fac0acfc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fabb9172897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fabba44bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fabba450a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fabba451dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fac05eeae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fac0af31609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fac0acfc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fabb9172897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fabba0d5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fac05eeae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fac0af31609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fac0acfc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1199760897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f119aa39c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f119aa3ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f119aa3fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f11e64d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f11eb51f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default2]:frame #6: clone + 0x43 (0x7f11eb2ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1199760897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f119aa39c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f119aa3ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb8d8231897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb8d950ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f119aa3fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f11e64d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb8d950fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb8d9510dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fb924fa9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fb929ff0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fb929dbb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:frame #5: + 0x8609 (0x7f11eb51f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f11eb2ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1199760897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f119a6c3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #2: + 0xd3e95 (0x7f11e64d8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f11eb51f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f11eb2ea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb8d8231897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb8d950ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb8d950fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb8d9510dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]: -[default5]:frame #4: + 0xd3e95 (0x7fb924fa9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fb929ff0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fb929dbb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb8d8231897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fb8d9194119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fb924fa9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fb929ff0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fb929dbb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f623c608897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f623d8e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f623d8e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f623d8e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f6289380e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f628e3c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f628e192353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f623c608897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f623d8e1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f623d8e6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f623d8e7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f6289380e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f628e3c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f628e192353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f623c608897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f623d56b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f6289380e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f628e3c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f628e192353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f123dfde897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f123f2b7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f123f2bca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f123f2bddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f128ad56e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f128fd9d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f128fb68353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f123dfde897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f123f2b7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f123f2bca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f123f2bddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f128ad56e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f128fd9d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f128fb68353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f123dfde897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f123ef41119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f128ad56e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f128fd9d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f128fb68353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc36e706897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc36f9dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc36f9e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc36f9e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d189d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #4: + 0xd3e95 (0x7fc3bb47ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc3c04c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2d19cadc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2d19cb2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #6: clone + 0x43 (0x7fc3c0290353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2d19cb3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #4: + 0xd3e95 (0x7f2d6574ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc36e706897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc36f9dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc36f9e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc36f9e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fc3bb47ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fc3c04c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fc3c0290353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc36e706897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #5: + 0x8609 (0x7f2d6a793609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2d6a55e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:frame #1: + 0xe32119 (0x7fc36f669119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fc3bb47ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]:frame #3: + 0x8609 (0x7fc3c04c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fc3c0290353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d189d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]: -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2d19cadc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2d19cb2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2d19cb3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f2d6574ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2d6a793609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2d6a55e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d189d4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f2d19937119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f2d6574ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f2d6a793609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f2d6a55e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9907887897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9908b60c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9908b65a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9908b66dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f99545ffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9959646609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9959411353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=1048576, NumelOut=1048576, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9907887897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9908b60c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9908b65a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9908b66dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f99545ffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9959646609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9959411353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9907887897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f99087ea119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f99545ffe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f9959646609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f9959411353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -W0703 03:57:57.892000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28873 closing signal SIGTERM -W0703 03:57:57.892000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28874 closing signal SIGTERM -W0703 03:57:57.892000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28875 closing signal SIGTERM -W0703 03:57:57.892000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28876 closing signal SIGTERM -W0703 03:57:57.892000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28878 closing signal SIGTERM -W0703 03:57:57.893000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28879 closing signal SIGTERM -W0703 03:57:57.893000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28880 closing signal SIGTERM -W0703 03:57:57.936000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1311137 closing signal SIGTERM -W0703 03:57:57.936000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1311138 closing signal SIGTERM -W0703 03:57:57.937000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1311139 closing signal SIGTERM -W0703 03:57:57.937000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1311141 closing signal SIGTERM -W0703 03:57:57.937000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1311142 closing signal SIGTERM -W0703 03:57:57.937000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1311143 closing signal SIGTERM -W0703 03:57:57.937000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1311144 closing signal SIGTERM -W0703 03:57:58.001000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 800121 closing signal SIGTERM -W0703 03:57:58.001000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 800122 closing signal SIGTERM -W0703 03:57:58.001000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 800123 closing signal SIGTERM -W0703 03:57:58.001000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 800124 closing signal SIGTERM -W0703 03:57:58.001000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 800125 closing signal SIGTERM -W0703 03:57:58.001000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 800126 closing signal SIGTERM -W0703 03:57:58.001000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 800128 closing signal SIGTERM -W0703 03:57:58.023000 140168436774720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 328952 closing signal SIGTERM -W0703 03:57:58.024000 140168436774720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 328953 closing signal SIGTERM -W0703 03:57:58.024000 140168436774720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 328954 closing signal SIGTERM -W0703 03:57:58.024000 
140168436774720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 328955 closing signal SIGTERM -W0703 03:57:58.024000 140168436774720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 328957 closing signal SIGTERM -W0703 03:57:58.024000 140168436774720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 328958 closing signal SIGTERM -W0703 03:57:58.024000 140168436774720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 328959 closing signal SIGTERM -E0703 03:57:58.147000 139742687819584 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1666398) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper -E0703 03:57:58.215000 140183913641792 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 842708) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:57:58 - host : ip-26-0-162-233.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1666399) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666399 -[2]: - time : 2024-07-03_03:57:58 - host : ip-26-0-162-233.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1666400) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666400 -[3]: - time : 2024-07-03_03:57:58 - host : ip-26-0-162-233.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1666401) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666401 -[4]: - time : 2024-07-03_03:57:58 - host : ip-26-0-162-233.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1666402) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666402 -[5]: - time : 2024-07-03_03:57:58 - host : ip-26-0-162-233.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1666403) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666403 -[6]: - time : 2024-07-03_03:57:58 - host : ip-26-0-162-233.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1666404) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666404 -[7]: - time : 2024-07-03_03:57:58 - host : 
ip-26-0-162-233.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1666405) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666405 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:57:58 - host : ip-26-0-162-233.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1666398) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1666398 -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 842709) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842709 -[2]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 842710) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842710 -[3]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 842711) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842711 -[4]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 842712) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842712 -[5]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 842713) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842713 -[6]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 842714) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842714 -[7]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 842715) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842715 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:57:58 - host : ip-26-0-174-36.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 842708) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 842708 
-============================================================ -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -srun: error: ip-26-0-162-233: task 0: Exited with exit code 1 -E0703 03:58:00.046000 140525533763392 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 4 (pid: 28877) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:58:00.062000 140525533763392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_28800_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:00.091000 140525533763392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_28800_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:00.099000 140525533763392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_28800_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:57:57 - host : ip-26-0-166-125.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 28877) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 28877 -============================================================ -E0703 03:58:00.207000 139873423791936 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 6 (pid: 800127) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:58:00.223000 139873423791936 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_800047_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:00.250000 139873423791936 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_800047_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:58:00.258000 139873423791936 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_800047_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:57:58 - host : ip-26-0-163-147.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 800127) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 800127 -============================================================ -E0703 03:58:00.474000 140174200911680 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 3 (pid: 1311140) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:58:00.512000 140174200911680 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1311064_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:00.542000 140174200911680 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1311064_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:00.550000 140174200911680 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1311064_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:57:57 - host : ip-26-0-173-202.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 1311140) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1311140 -============================================================ -srun: error: ip-26-0-163-147: task 1: Exited with exit code 1 -srun: error: ip-26-0-166-125: task 4: Exited with exit code 1 -E0703 03:58:01.018000 140168436774720 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 4 (pid: 328956) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:58:01.034000 140168436774720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_328879_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:01.068000 140168436774720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_328879_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:01.076000 140168436774720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_328879_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:57:58 - host : ip-26-0-173-246.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 328956) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 328956 -============================================================ -srun: error: ip-26-0-173-202: task 5: Exited with exit code 1 -srun: error: ip-26-0-173-246: task 6: Exited with exit code 1 -W0703 03:58:02.338000 140676376291072 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_902680_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:02.897000 140026275124992 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_413204_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 03:58:03.101000 140682037024576 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 902754) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:58:03.113000 140682037024576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_902680_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:03.142000 140682037024576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_902680_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 03:58:03.167000 140031935858496 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 413277) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:58:03.171000 140682037024576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_902680_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 902755) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902755 -[2]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 902756) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902756 -[3]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 902757) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902757 -[4]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 28 (local_rank: 4) - exitcode : -6 (pid: 902758) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902758 -[5]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 902759) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902759 -[6]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 902760) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902760 -[7]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 902761) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902761 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:58:02 - host : ip-26-0-165-24.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 902754) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 902754 -============================================================ -W0703 03:58:03.180000 140031935858496 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_413204_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:58:03.209000 140031935858496 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_413204_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:58:03.240000 140031935858496 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_413204_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 413278) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413278 -[2]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 413279) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413279 -[3]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 413280) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413280 -[4]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 413281) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413281 -[5]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 413282) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413282 -[6]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 413283) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413283 -[7]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 413284) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413284 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:58:03 - host : ip-26-0-164-207.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 413277) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 413277 
-============================================================ -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 2: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-2/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/bench.slurm deleted file mode 100644 index 5608e15496c8542edeb4f451090faa473597ef90..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. 
-# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/config.yaml deleted file mode 100644 index d652bb158bda93da166939dcf665515e9e8bfcd4..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 2 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 256 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out deleted file mode 100644 index 552aba351d561d2d8da732f2754cc201c8636ca5..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/log.out +++ /dev/null @@ -1,6248 +0,0 @@ -======================== -START TIME: Wed Jul 3 00:42:20 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 00:42:26.449000 140584891750208 torch/distributed/run.py:757] -W0703 00:42:26.449000 140584891750208 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.449000 140584891750208 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:42:26.449000 140584891750208 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.458000 140712948295488 torch/distributed/run.py:757] -W0703 00:42:26.458000 140712948295488 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.458000 140712948295488 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:42:26.458000 140712948295488 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.457000 139802089281344 torch/distributed/run.py:757] -W0703 00:42:26.457000 139802089281344 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.457000 139802089281344 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:42:26.457000 139802089281344 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.653000 139713462744896 torch/distributed/run.py:757] -W0703 00:42:26.653000 139713462744896 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.653000 139713462744896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:42:26.653000 139713462744896 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.655000 139638075492160 torch/distributed/run.py:757] -W0703 00:42:26.655000 139638075492160 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.655000 139638075492160 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
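These repeated OMP_NUM_THREADS warnings are torchrun's default behaviour when the variable is unset: each worker is pinned to a single OpenMP thread. For reference only, the bench.slurm deleted in this same diff reserves 96 CPUs per task and launches 8 ranks per node, so an explicitly tuned value would be on the order of 96 / 8 = 12 threads per rank; the captured runs simply keep the default of 1.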
-W0703 00:42:26.655000 139638075492160 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.677000 140588349990720 torch/distributed/run.py:757] -W0703 00:42:26.677000 140588349990720 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.677000 140588349990720 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:42:26.677000 140588349990720 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.709000 139705734608704 torch/distributed/run.py:757] -W0703 00:42:26.709000 139705734608704 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.709000 139705734608704 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:42:26.709000 139705734608704 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.768000 140509700273984 torch/distributed/run.py:757] -W0703 00:42:26.768000 140509700273984 torch/distributed/run.py:757] ***************************************** -W0703 00:42:26.768000 140509700273984 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:42:26.768000 140509700273984 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 00:42:51 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=2, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:42:51 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=256, -[default0]:07/03/2024 00:42:51 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=2, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256')), -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 00:42:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default3]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 00:43:09 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=11|ip-26-0-171-88]: No checkpoint path provided. 
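For scale, the configuration echoed above implies a very large optimizer step. Assuming nanotron's usual definition of the global batch as dp × micro_batch_size × batch_accumulation_per_replica, this run processes 2 × 256 × 2 = 1024 sequences of 4096 tokens, roughly 4.19M tokens per step; the snippet below is a sanity check of that arithmetic, not output from the run:

    # Rough sanity check of the batch geometry logged above (illustrative only).
    dp, micro_batch_size, grad_accum, seq_len = 2, 256, 2, 4096
    sequences_per_step = dp * micro_batch_size * grad_accum  # 1024
    tokens_per_step = sequences_per_step * seq_len           # 4,194,304
    print(sequences_per_step, tokens_per_step)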
-[default4]:07/03/2024 00:43:09 [INFO|DP=1|PP=1|TP=12|ip-26-0-171-88]: No checkpoint path provided.
-[... the remaining DP=1 ranks on ip-26-0-161-153, ip-26-0-161-78, ip-26-0-171-62 and ip-26-0-171-88 log the same "No checkpoint path provided." message ...]
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB)
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB)
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided.
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator
-[... every other DP=0|PP=0 rank (TP=0-7 on ip-26-0-160-225, TP=8-15 on ip-26-0-161-103) reports the same 43.2M (82.38MiB) local parameters, 98.13MiB memory usage and "No checkpoint path provided." ...]
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB)
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB
-[default0]:07/03/2024 00:43:09 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: No checkpoint path provided.
-[... every other DP=0|PP=1 rank (TP=0-7 on ip-26-0-162-233, TP=8-15 on ip-26-0-171-102) reports the same 32.7M (62.36MiB) local parameters, 73.37MiB memory usage and "No checkpoint path provided." ...]
-[default0]:07/03/2024 00:43:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 00:43:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 00:43:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states
-[default0]:07/03/2024 00:43:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:07/03/2024 00:43:13 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 00:43:13.932278 | mbs: 256 | grad_accum: 2 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 00:43:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB
-[... at 00:43:14 every rank logs "Repo card metadata block was not found. Setting CardData to empty." while loading the dataset, both as timestamped WARNING lines and as bare [defaultN] copies ...]
-[... training then fails immediately: ranks 0, 1, 3, 5, 6, 9, 10 and 12 raise identical interleaved tracebacks; rank 10's is reproduced in full below ...]
-[default2]:[rank10]: Traceback (most recent call last):
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank10]: trainer.train(dataloader)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank10]: output = model(**micro_batch)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank10]: return self._call_impl(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank10]: return forward_call(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank10]: sharded_logits = self.model(
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank10]: return self._call_impl(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank10]: return forward_call(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank10]: return self._call_impl(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank10]: return forward_call(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default2]:[rank10]: output = self.pp_block(**new_kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank10]: return self._call_impl(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank10]: return forward_call(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank10]: return self._call_impl(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank10]: return forward_call(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default2]:[rank10]: output = self.o_proj(attention_output)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank10]: return self._call_impl(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank10]: return forward_call(*args, **kwargs)
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default2]:[rank10]: return row_linear(
-[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default2]:[rank10]: out = F.linear(input, weight, bias)
-[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 793.94 MiB is free. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 881.94 MiB is free. Including non-PyTorch memory, this process has 78.46 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 793.94 MiB is free. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: output = model(**micro_batch) -[default5]:[rank13]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: output = model(**micro_batch) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank13]: sharded_logits = self.model( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default0]:[rank8]: sharded_logits = self.model( -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, 
**kwargs) -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: output = model(**micro_batch) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank13]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank14]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 881.94 MiB is free. Including non-PyTorch memory, this process has 78.46 GiB memory in use. 
Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank13]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank13]: output = self.o_proj(attention_output) -[default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: return row_linear( -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: output = self.o_proj(attention_output) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default5]:[rank13]: out = F.linear(input, weight, bias) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 881.94 MiB is free. Including non-PyTorch memory, this process has 78.46 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: out = F.linear(input, weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: output = self.o_proj(attention_output) -[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank14]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear 
-[default3]:[rank11]: out = F.linear(input, weight, bias) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 881.94 MiB is free. Including non-PyTorch memory, this process has 78.46 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank14]: output = self.o_proj(attention_output) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank14]: return row_linear( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: out = F.linear(input, weight, bias) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 793.94 MiB is free. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
[log.out, condensed: ranks 0-7 (default0-default7 on the first node) fail with the identical traceback, ending in the same F.linear call inside row_linear for the attention o_proj. Representative error, from rank 7:]
-[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.36 GiB is free. Including non-PyTorch memory, this process has 77.96 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[Ranks 1, 3 and 5 report the same figures (1.36 GiB free, 77.96 GiB in use); ranks 2, 4 and 6 report 1.45 GiB free (77.87 GiB in use); rank 0's error message is cut off mid-line in the log.]
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank26]: output = model(**micro_batch) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank26]: sharded_logits = self.model( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank26]: output = self.pp_block(**new_kwargs) -[default2]:[rank26]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank26]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank26]: output = self.o_proj(attention_output) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: return row_linear( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: out = F.linear(input, weight, bias) -[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 673.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank29]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: output = model(**micro_batch) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = model(**micro_batch) -[default0]:[rank24]: trainer.train(dataloader) -[default3]:[rank27]: output = model(**micro_batch) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: sharded_logits = self.model( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank29]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank30]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in 
_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: output = model(**micro_batch) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, 
in _call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank24]: sharded_logits = self.model( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank24]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: output = self.o_proj(attention_output) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: output = model(**micro_batch) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default5]:[rank29]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, 
in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank29]: out = F.linear(input, weight, bias) -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 841.94 MiB is free. Including non-PyTorch memory, this process has 78.50 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default0]:[rank24]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: sharded_logits = self.model( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: output = self.o_proj(attention_output) -[default6]:[rank30]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) 
-[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank24]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: output = self.o_proj(attention_output) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: output = self.o_proj(attention_output) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, 
**kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 841.94 MiB is free. Including non-PyTorch memory, this process has 78.50 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank24]: return row_linear( -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: out = F.linear(input, weight, bias) -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 673.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank31]: output = self.o_proj(attention_output) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank31]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 841.94 MiB is free. Including non-PyTorch memory, this process has 78.50 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: output = model(**micro_batch) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: sharded_logits = self.model( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: output = self.o_proj(attention_output) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: output = self.o_proj(attention_output) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank21]: return row_linear( -[default3]:[rank19]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 793.94 MiB is free. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 793.94 MiB is free. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward 
-[default1]:[rank25]: sharded_logits = self.model( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank25]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank25]: output = self.o_proj(attention_output) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank25]: return row_linear( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 841.94 MiB is free. Including non-PyTorch memory, this process has 78.50 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default6]:[rank22]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank22]: output = model(**micro_batch) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank22]: sharded_logits = self.model( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", 
line 780, in forward_with_hidden_states -[default6]:[rank22]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank22]: output = self.pp_block(**new_kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank22]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank22]: output = self.o_proj(attention_output) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank22]: return row_linear( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: out = F.linear(input, weight, bias) -[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 881.94 MiB is free. Including non-PyTorch memory, this process has 78.46 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states 
-[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: output = self.o_proj(attention_output) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: return row_linear( -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: output = self.o_proj(attention_output) -[default2]:[rank18]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 881.94 MiB is free. Including non-PyTorch memory, this process has 78.46 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: output = model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank20]: sharded_logits = self.model( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank20]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank20]: output = self.o_proj(attention_output) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank20]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: out = F.linear(input, weight, bias) 
-[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 881.94 MiB is free. Including non-PyTorch memory, this process has 78.46 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: Traceback (most recent call last): -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) 
-[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: sharded_logits = self.model( -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank17]: output = self.o_proj(attention_output) -[default7]:[rank23]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default7]:[rank23]: output = self.o_proj(attention_output) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 793.94 MiB is free. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 793.94 MiB is free. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: output = model(**micro_batch) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: sharded_logits = self.model( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank28]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank28]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank28]: output = self.o_proj(attention_output) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank28]: return row_linear( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank28]: out = F.linear(input, weight, bias) -[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 673.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 68.69 GiB is allocated by PyTorch, and 450.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank34]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank34]: pipeline_state.run_communication() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank34]: recv_activation_tensor = recv_activation() -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: dist.recv( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank34]: return func(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank34]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank34]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default2]:[rank34]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank34]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd8fb9f2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank34]: frame #1: + 0x5b3a23e (0x7fd93550f23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fd935509c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank34]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fd935509f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fd93550afd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd9354bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd9354bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank34]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd9354bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default1]:[rank33]: dist.recv( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank33]: return func(*args, **kwargs) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd9354bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -n/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() - -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd8fcccc189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[d[default1]:[rank33]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -efault0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank56]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank56]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f464485b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank56]: frame #1: + 0x5b3a23e (0x7f467e37823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #2: c[default1]:[rank33]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank33]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbd64621897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank33]: frame #1: + 0x5b3a23e (0x7fbd9e13e23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f467e372c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f467e372f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f467e373fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f467e328371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f467e328371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10[default1]:[rank33]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fbd9e138c87 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f467e328371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f467e328371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f4645b35189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4645b3c610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #11: c10d::Process[default1]:[rank33]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fbd9e138f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -GroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4645b5b978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #12: + 0x5adc309 (0x7f467e31a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #13: + 0x5ae6f10 (0x7f467e324f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fbd9e139fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fbd9e0ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #14: + 0x5ae6fa5 (0x7f467e324fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #15: + 0x5124446 (0x7f467d962446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fbd9e0ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #16: + 0x1acf4b8 (0x7f467a30d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #17: + 0x5aee004 (0x7f467e32c004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fbd9e0ee371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fbd9e0ee371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #18: + 0x5af36b5 (0x7f467e3316b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd8fccd3610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd8fccf2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #19: + 0xd2631e (0x7f4690f1b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #12: + 0x5adc309 (0x7fd9354b1309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #20: + 0x47def4 (0x7f4690672ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #21: + 0x1445a6 (0x55e9897885a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55e989781a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #13: + 0x5ae6f10 (0x7fd9354bbf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #23: + 0x150866 (0x55e989794866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55e98977d142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fbd658fb189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fbd65902610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55e989788a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #26: PyObject_Call + 0xbc (0x55e989794f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55e98977b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55e989788a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fbd65921978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55e9897798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #30: 
+ 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55e9897798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #14: + 0x5ae6fa5 (0x7fd9354bbfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #32: + 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55e9897798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #34: + 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #12: + 0x5adc309 (0x7fbd9e0e0309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55e9897798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55e989780f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55e989792c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #38: + 0x211239 (0x55e989855239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #13: + 0x5ae6f10 (0x7fbd9e0eaf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55e989781a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55e98977d3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #15: + 0x5124446 (0x7fd934af9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55e989788a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55e989778c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55e989788a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #16: + 0x1acf4b8 (0x7fd9314a44b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55e9897798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #45: + 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #14: + 0x5ae6fa5 (0x7fbd9e0eafa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #17: + 0x5aee004 (0x7fd9354c3004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #46: PyObject_Call + 
0xbc (0x55e989794f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55e98977b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #48: + 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #49: PyObject_Call + 0xbc (0x55e989794f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55e98977b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55e989788a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55e989781007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/[default2]:[rank34]: frame #18: + 0x5af36b5 (0x7fd9354c86b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #15: + 0x5124446 (0x7fbd9d728446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #16: + 0x1acf4b8 (0x7fbd9a0d34b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank33]: frame #17: + 0x5aee004 (0x7fbd9e0f2004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -bin/python3.10) -[default0]:[rank56]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55e989792c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #54: + 0x211239 (0x55e989855239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #55: PyObject_Call + 0x207 (0x55e989795067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55e98977b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #18: + 0x5af36b5 (0x7fbd9e0f76b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #57: + 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55e9897798fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #59: + 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #19: + 0xd2631e (0x7fd9480b231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #60: PyObject_Call + 0xbc (0x55e989794f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55e98977b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #62: + 0x150582 (0x55e989794582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #19: + 0xd2631e (0x7fbdb0ce131e in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #20: + 0x47def4 (0x7fd947809ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: frame #63: PyObject_Call + 0xbc (0x55e989794f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank33]: frame #20: + 0x47def4 (0x7fbdb0438ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #21: + 0x1445a6 (0x55bc508f45a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55bc508eda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #23: + 0x150866 (0x55bc50900866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55bc508e9142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55bc508f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #21: + 0x1445a6 (0x5566253aa5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #26: PyObject_Call + 0xbc (0x55bc50900f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5566253a3a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #23: + 0x150866 (0x5566253b6866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc508e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55662539f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5566253aaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #26: PyObject_Call + 0xbc (0x5566253b6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55662539d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5566253aaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55662539b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55bc508f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55bc508e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #30: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55662539b8fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #32: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #30: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55bc508e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55662539b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #34: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55662539b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #32: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5566253a2f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5566253b4c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55bc508e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #38: + 0x211239 (0x556625477239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5566253a3a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #34: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55bc508e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55bc508ecf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55662539f3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5566253aaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55662539ac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5566253aaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55bc508fec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55662539b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #45: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #46: PyObject_Call + 0xbc (0x5566253b6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55662539d2b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #48: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #49: PyObject_Call + 0xbc (0x5566253b6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #38: + 0x211239 (0x55bc509c1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55662539d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55bc508eda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55bc508e93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55bc508f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55bc508e4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5566253aaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55bc508f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5566253a3007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5566253b4c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55bc508e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #54: + 0x211239 (0x556625477239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #55: PyObject_Call + 0x207 (0x5566253b7067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #45: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #46: PyObject_Call + 0xbc (0x55bc50900f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55662539d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc508e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #48: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #49: PyObject_Call + 0xbc (0x55bc50900f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc508e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #57: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55bc508f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default2]:[rank34]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55bc508ed007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55bc508fec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #54: + 0x211239 (0x55bc509c1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #55: PyObject_Call + 0x207 (0x55bc50901067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc508e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #57: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55bc508e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #59: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #60: PyObject_Call + 0xbc (0x55bc50900f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55bc508e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #62: + 0x150582 (0x55bc50900582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #63: PyObject_Call + 0xbc (0x55bc50900f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank33]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55662539b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #59: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: Traceback (most recent call last): -[default1]:[rank33]: frame #60: PyObject_Call + 0xbc (0x5566253b6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55662539d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #62: + 0x150582 (0x5566253b6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: frame #63: PyObject_Call + 0xbc (0x5566253b6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank33]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
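Every failing rank dies at the same point: the first dist.recv on a pipeline-parallel boundary (p2p.py:246, _recv_meta) lazily creates the NCCL point-to-point communicator, and during that setup the receiving rank fetches the ncclUniqueId from the c10d TCP key-value store. "Connection reset by peer" on that fetch means the process hosting the store was already gone, which matches the trailing hint about a crash on rank 0 or a network setup problem. The sketch below is an editor's illustration of that code path, not part of the benchmark scripts; the file name and tensor shape are made up, and it assumes a node with 2 GPUs launched via torchrun.

# repro_p2p.py -- minimal illustrative sketch, not from bench_cluster or nanotron.
# Launch (assumption): torchrun --nproc_per_node 2 repro_p2p.py
import torch
import torch.distributed as dist

def main() -> None:
    # torchrun supplies the env:// rendezvous; this is the same c10d TCPStore
    # the traceback shows being queried for the ncclUniqueId.
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank)

    payload = torch.full((4,), float(rank), device="cuda")
    if rank == 0:
        dist.send(payload, dst=1)   # sender side of the boundary exchange
    else:
        # The first p2p op between a rank pair builds their NCCL communicator;
        # if the store's host has died, this raises the same DistBackendError
        # ("Connection reset by peer") seen in the log above.
        dist.recv(payload, src=0)

    dist.destroy_process_group()

if __name__ == "__main__":
    main()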
-[default2]:[rank58]: Traceback (most recent call last):
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank58]: trainer.train(dataloader)
-[... rank 58 follows the same call chain as rank 56 above (trainer.py:429 train -> trainer.py:462 training_step -> engine.py:278 train_batch_iter -> engine.py:44 forward -> llama.py:891/764/780 -> block.py:126 forward -> functional.py:117 recv_from_pipeline_state_buffer -> state.py:150/31 -> p2p.py:353/326/246 -> dist.recv) and raises the same torch.distributed.DistBackendError: store->get('0:1') got error: Connection reset by peer, followed by the same C++ stack starting at recvBytes (Utils.hpp:672) ...]
-[default4]:[rank36]: Traceback (most recent call last):
-[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default4]:[rank36]: trainer.train(dataloader)
-[... rank 36 fails identically: same traceback down to p2p.py:246 _recv_meta -> dist.recv, same DistBackendError, same C++ stack (frames #0-#63), ending with ". This may indicate a possible application crash on rank 0 or a network set up issue." ...]
-[default4]:[rank60]: Traceback (most recent call last):
-[default5]:[rank61]: Traceback (most recent call last):
-[default1]:[rank57]: Traceback (most recent call last):
-[... ranks 60, 61 and 57 begin the same traceback (run_train.py:237 -> trainer.train -> training_step -> train_batch_iter -> forward); at this point in the log rank 60 has reached p2p.py:246 _recv_meta, while ranks 61 and 57 have just entered self.model(...) at llama.py:891 ...]
-[default5]:[rank61]: return self._call_impl(*args, **kwargs)
-[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563a27c69a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default2]:[rank58]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563a27c5a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return func(*args, **kwargs) -[default2]:[rank58]: frame #30: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563a27c5a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #32: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563a27c5a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #34: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank58]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563a27c5a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank58]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563a27c61f50 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6467b5c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: frame #1: + 0x5b3a23e (0x7f64a167923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f64a1673c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f64a1673f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563a27c73c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f64a1674fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: frame #38: + 0x211239 (0x563a27d36239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f64a1629371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563a27c62a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default5]:[rank61]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default4]:[rank60]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f64a1629371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563a27c5e3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563a27c69a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563a27c59c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f64a1629371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563a27c69a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563a27c5a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f64a1629371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: frame #45: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f6468e36189 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f6468e3d610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f6468e5c978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #12: + 0x5adc309 (0x7f64a161b309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #46: PyObject_Call + 0xbc (0x563a27c75f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563a27c5c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #13: + 0x5ae6f10 (0x7f64a1625f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank58]: frame #48: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #49: PyObject_Call + 0xbc (0x563a27c75f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: dist.recv( -[default2]:[rank58]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563a27c5c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #14: + 0x5ae6fa5 (0x7f64a1625fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default1]:[rank57]: dist.recv( -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank58]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563a27c69a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563a27c62007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank44]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank44]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f46b13ec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank44]: frame #1: + 0x5b3a23e (0x7f46eaf0923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank44]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f46eaf03c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563a27c73c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f46eaf03f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f46eaf04fd1 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank44]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f46eaeb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f46eaeb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f46eaeb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f46eaeb9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank61]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5656c1c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank44]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f46b26c6189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f46b26cd610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f46b26ec978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #54: + 0x211239 (0x563a27d36239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #55: PyObject_Call + 0x207 (0x563a27c76067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #12: + 0x5adc309 (0x7f46eaeab309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #13: + 0x5ae6f10 (0x7f46eaeb5f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563a27c5c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #1: + 0x5b3a23e (0x7f569073923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #15: + 0x5124446 (0x7f64a0c63446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #16: + 0x1acf4b8 (0x7f649d60e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) 
-[default4]:[rank60]: frame #17: + 0x5aee004 (0x7f64a162d004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #14: + 0x5ae6fa5 (0x7f46eaeb5fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #15: + 0x5124446 (0x7f46ea4f3446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #16: + 0x1acf4b8 (0x7f46e6e9e4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #17: + 0x5aee004 (0x7f46eaebd004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #18: + 0x5af36b5 (0x7f46eaec26b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #19: + 0xd2631e (0x7f46fdaac[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank58]: frame #57: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #20: + 0x47def4 (0x7f46fd203ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #21: + 0x1445a6 (0x55b49ba0f5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b49ba08a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #18: + 0x5af36b5 (0x7f64a16326b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #23: + 0x150866 (0x55b49ba1b866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b49ba04142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b49ba0fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #19: + 0xd2631e (0x7f64b421c31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank44]: frame #26: PyObject_Call + 0xbc (0x55b49ba1bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55b49ba022b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55b49ba0fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563a27c5a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b49ba008fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #30: + 0x150582 (0x55b49ba1b582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b49ba008fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #32: + 0x150582 (0x55b49ba1b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f68a56e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank60]: frame #20: + 0x47def4 (0x7f64b3973ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: frame #21: + 0x1445a6 (0x55be317405a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b49ba008fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #34: + 0x150582 (0x55b49ba1b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b49ba008fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55be31739a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #59: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b49ba07f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b49ba19c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #1: + 0x5b3a23e (0x7f68df20523e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #23: + 0x150866 (0x55be3174c866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #38: + 0x211239 (0x55b49badc239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b49ba08a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b49ba043e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #60: PyObject_Call + 0xbc (0x563a27c75f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b49ba0fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b49b9ffc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b49ba0fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b49ba008fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f5690733c87 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #45: + 0x150582 (0x55b49ba1b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #46: PyObject_Call + 0xbc (0x55b49ba1bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563a27c5c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b49ba022b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #48: + 0x150582 (0x55b49ba1b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #49: PyObject_Call + 0xbc (0x55b49ba1bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b49ba022b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b49ba0fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b49ba08007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b49ba19c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench[default1]:[rank57]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f68df1ffc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f5690733f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #62: + 0x150582 (0x563a27c75582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) --cluster/bin/python3.10) -[default4]:[rank44]: frame #54: + 0x211239 (0x55b49badc239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #55: PyObject_Call + 0x207 (0x55b49ba1c067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b49ba022b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #57: + 0x150582 (0x55b49ba1b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b49ba008fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #59: + 0x150582 (0x55b49ba1b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #60: PyObject_Call + 0xbc (0x55b49ba1bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b49ba022b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55be31735142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55be31740a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #62: + 0x150582 (0x55b49ba1b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #63: PyObject_Call + 0xbc (0x55b49ba1bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank58]: frame #63: PyObject_Call + 0xbc (0x563a27c75f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f5690734fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #26: PyObject_Call + 0xbc (0x55be3174cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default1]:[rank57]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f68df1fff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f68df200fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default4]:[rank60]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55be317332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55be31740a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-[default3]:[rank59]: Traceback (most recent call last): -[default1]:[rank57]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f68df1b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f68df1b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communi[default4]:[rank60]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55be317318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f56906e9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-cation -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank32]: di[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f56906e9371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -st.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: Traceback (most recent call last): -[default1]:[rank57]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f68df1b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #30: + 0x150582 (0x55be3174c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank32]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank32]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ea1139897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank32]: frame #1: + 0x5b3a23e (0x7f3edac5623e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/tor[default1]:[rank57]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f68df1b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f3edac50c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f3edac50f82 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[... identical Python tracebacks and NCCL TCPStore stack dumps from ranks 32, 39, 40, 43, 47, 48, 55, 57, 59, 60 and 61 are interleaved through this part of the log; the rank 38 record below shows the common failure path ...]
-[default6]:[rank38]: Traceback (most recent call last):
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank38]:     trainer.train(dataloader)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank38]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank38]:     outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank38]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank38]:     output = model(**micro_batch)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank38]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank38]:     return forward_call(*args, **kwargs)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank38]:     sharded_logits = self.model(
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank38]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank38]:     return forward_call(*args, **kwargs)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank38]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank38]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank38]:     return self._call_impl(*args, **kwargs)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank38]:     return forward_call(*args, **kwargs)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank38]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank38]:     pipeline_state.run_communication()
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank38]:     recv_activation_tensor = recv_activation()
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank38]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank38]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank38]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default6]:[rank38]:     dist.recv(
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank38]:     return func(*args, **kwargs)
-[default6]:[rank38]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank38]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank38]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank38]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank38]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f83200f2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:[rank38]: frame #1: <unknown function> + 0x5b3a23e (0x7f8359c0f23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f8359c09c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f8359c09f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f8359c0afd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8359bbf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8359bbf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8359bbf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8359bbf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank38]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f83213cc189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank38]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f83213d3610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank38]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f83213f2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank38]: [... frames #12-#63: libtorch_cpu.so/libtorch_python.so internals and CPython interpreter frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, ...) ...]
-[default6]:[rank38]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default0]:[rank48]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f36a6dcd897 in
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55be317332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #32: + 0x150582 (0x55b09dfca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: frame #1: + 0x5b3a23e (0x7f36e08ea23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: pipeline_state.run_communication() -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f36e08e4c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: frame #49: PyObject_Call + 0xbc (0x555a7bdebf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b09dfaf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f36e08e4f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f36e08e5fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x555a7bdd22b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f36e089a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: dist.recv( -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: frame #34: + 0x150582 (0x55b09dfca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank55]: frame #14: + 0x5ae6fa5 (0x7efdf062ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: pipeline_state.run_communication() -[default1]:[rank57]: frame #51: _PyFunction_Vectorcall + 0x6c (0x555a7bddfa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank57]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x555a7bdd8007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f36e089a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f36e089a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: frame #53: _PyObject_Call_Prepend + 0x69 (0x555a7bde9c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b09dfaf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #62: + 0x150582 (0x55be3174c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: 
torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank48]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f36e089a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f36a80a7189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: frame #54: + 0x211239 (0x555a7beac239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank39]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faa85f71897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank39]: frame #1: + 0x5b3a23e (0x7faabfa8e23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f36a80ae610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f36a80cd978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #12: + 0x5adc309 (0x7f36e088c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #15: + 0x5124446 (0x7efdefc6d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: return func(*args, **kwargs) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x55be3174cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank39]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7faabfa88c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7faabfa88f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7faabfa89fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faabfa3e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faabfa3e371 in /fsx/ferdinandmom/miniforge3/envs/[default0]:[rank48]: frame #13: + 0x5ae6f10 (0x7f36e0896f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b09dfb6f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faabfa3e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7faabfa3e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7faa8724b189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7faa87252610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[r[default7]:[rank55]: frame #16: + 0x1acf4b8 (0x7efdec6184b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #17: + 0x5aee004 (0x7efdf0637004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -ank39]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7faa87271978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank39]: frame #12: + 0x5adc309 (0x7faabfa30309 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #13: + 0x5ae6f10 (0x7faabfa3af10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #18: + 0x5af36b5 (0x7efdf063c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: frame #55: PyObject_Call + 0x207 (0x555a7bdec067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #14: + 0x5ae6fa5 (0x7faabfa3afa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #14: + 0x5ae6fa5 (0x7f36e0896fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b09dfc8c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #15: + 0x5124446 (0x7faabf078446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #16: + 0x1acf4b8 (0x7faabba234b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #17: + 0x5aee004 (0x7faabfa42004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #19: + 0xd2631e (0x7efe0322631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank55]: frame #20: + 0x47def4 (0x7efe0297def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #15: + 0x5124446 (0x7f36dfed4446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #16: + 0x1acf4b8 (0x7f36dc87f4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x555a7bdd22b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #18: + 0x5af36b5 (0x7faabfa476b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #19: + 0xd2631e (0x7faad263131e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #20: + 0x47def4 (0x7faad1d88ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #17: + 0x5aee004 (0x7f36e089e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in 
recv_tensors -[default0]:[rank40]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank57]: frame #57: + 0x150582 (0x555a7bdeb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #21: + 0x1445a6 (0x557ef6ce75a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #22: _PyObject_MakeTpCall + 0x26b (0x557ef6ce0a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #23: + 0x150866 (0x557ef6cf3866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x557ef6cdc142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #21: + 0x1445a6 (0x5613a01815a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5613a017aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: frame #38: + 0x211239 (0x55b09e08b239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: frame #25: _PyFunction_Vectorcall + 0x6c (0x557ef6ce7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #26: PyObject_Call + 0xbc (0x557ef6cf3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x557ef6cda2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #18: + 0x5af36b5 (0x7f36e08a36b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank61]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b09dfb7a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #28: _PyFunction_Vectorcall + 0x6c (0x557ef6ce7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x557ef6cd88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #30: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x557ef6cd88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #19: + 0xd2631e (0x7f36f348d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank55]: frame #23: + 0x150866 (0x5613a018d866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame 
#0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe83730f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: frame #32: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x557ef6cd88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #34: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #20: + 0x47def4 (0x7f36f2be4ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b09dfb33e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x557ef6cd88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x557ef6cdff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #37: _PyObject_Call_Prepend + 0x69 (0x557ef6cf1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #38: + 0x211239 (0x557ef6db4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #21: + 0x1445a6 (0x555a2be205a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #22: _PyObject_MakeTpCall + 0x26b (0x555a2be19a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: frame #1: + 0x5b3a23e (0x7fe870e2c23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank39]: frame #39: _PyObject_MakeTpCall + 0x26b (0x557ef6ce0a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x557ef6cdc3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #41: _PyFunction_Vectorcall + 0x6c (0x557ef6ce7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x557ef6cd7c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #23: + 0x150866 (0x555a2be2c866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 
(0x7fe870e26c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x555a7bdd08fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #43: _PyFunction_Vectorcall + 0x6c (0x557ef6ce7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x557ef6cd88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #45: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5613a0176142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fe870e26f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank59]: dist.recv( -[default7]:[rank39]: frame #46: PyObject_Call + 0xbc (0x557ef6cf3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x557ef6cda2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #48: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #49: PyObject_Call + 0xbc (0x557ef6cf3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x557ef6cda2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x555a2be15142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #25: _PyFunction_Vectorcall + 0x6c (0x555a2be20a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5613a0181a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fe870e27fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b09dfbea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #51: _PyFunction_Vectorcall + 0x6c (0x557ef6ce7a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557ef6ce0007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #53: _PyObject_Call_Prepend + 0x69 (0x557ef6cf1c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #54: + 0x211239 (0x557ef6db4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default7]:[rank39]: frame #55: PyObject_Call + 0x207 (0x557ef6cf4067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x557ef6cda2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #57: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cl[default0]:[rank48]: frame #26: PyObject_Call + 0xbc (0x555a2be2cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: dist.recv( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -uster/bin/python3.10) -[default7]:[rank39]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x557ef6cd88fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #59: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #60: PyObject_Call + 0xbc (0x557ef6cf3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x557ef6cda2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #26: PyObject_Call + 0xbc (0x5613a018df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe870ddc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b09dfaec5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #62: + 0x150582 (0x557ef6cf3582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #63: PyObject_Call + 0xbc (0x557ef6cf3f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default0]:[rank48]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x555a2be132b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default0]:[rank48]: frame #28: _PyFunction_Vectorcall + 0x6c (0x555a2be20a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5613a01742b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe870ddc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank57]: frame #59: + 0x150582 (0x555a7bdeb582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[def[default0]:[rank48]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x555a2be118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #30: + 0x150582 (0x555a2be2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x555a2be118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5613a0181a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -ault5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: frame #32: + 0x150582 (0x555a2be2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x555a2be118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #34: + 0x150582 (0x555a2be2c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x555a2be118fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5613a01728fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: return func(*args, **kwargs) -[default5]:[rank61]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b09dfbea2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from[default0]:[rank48]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x555a2be18f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: dist.recv( -[default1]:[rank57]: frame #60: PyObject_Call + 0xbc (0x555a7bdebf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag[default7]:[rank55]: frame #30: + 0x150582 (0x5613a018d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5613a01728fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #32: + 0x150582 (0x5613a018d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5613a01728fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #37: _PyObject_Call_Prepend + 0x69 (0x555a2be2ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #34: + 0x150582 (0x5613a018d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b09dfaf8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: frame #38: + 0x211239 (0x555a2beed239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: frame #45: + 0x150582 (0x55b09dfca582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank37]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank37]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 
(0x7f1e75493897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank37]: frame #1: + 0x5b3a23e (0x7f1eaefb023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f1eaefaac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5613a01728fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5613a0179f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #39: _PyObject_MakeTpCall + 0x26b (0x555a2be19a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank61]: frame #46: PyObject_Call + 0xbc (0x55b09dfcaf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank37]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f1eaefaaf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f1eaefabfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1eaef60371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1eaef60371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank37]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1eaef60371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libto[default7]:[rank55]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5613a018bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe870ddc371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: return func(*args, **kwargs) -[default5]:[rank61]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b09dfb12b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -rch_cpu.so) -[default5]:[rank37]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1eaef60371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x555a2be153e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #38: + 0x211239 (0x5613a024e239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #41: 
-[default3]:[rank35]: Traceback (most recent call last):
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank35]:     trainer.train(dataloader)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank35]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank35]:     outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank35]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank35]:     output = model(**micro_batch)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]:     return forward_call(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank35]:     sharded_logits = self.model(
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]:     return forward_call(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank35]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank35]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]:     return forward_call(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default3]:[rank35]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default3]:[rank35]:     pipeline_state.run_communication()
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default3]:[rank35]:     recv_activation_tensor = recv_activation()
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default3]:[rank35]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank35]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank35]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default3]:[rank35]:     dist.recv(
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank35]:     return func(*args, **kwargs)
-[default3]:[rank35]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank35]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank35]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default3]:[rank35]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default3]:[rank35]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (libc10.so)
-[default3]:[rank35]: frame #1: <unknown function> + 0x5b3a23e (libtorch_cpu.so)
-[default3]:[rank35]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::milliseconds) + 0x2c7 (libtorch_cpu.so)
-[default3]:[rank35]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (libtorch_cpu.so)
-[default3]:[rank35]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (libtorch_cpu.so)
-[default3]:[rank35]: frames #5-#8: c10d::PrefixStore::get(std::string const&) + 0x31 (libtorch_cpu.so)
-[default3]:[rank35]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (libtorch_cuda.so)
-[default3]:[rank35]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (libtorch_cuda.so)
-[default3]:[rank35]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor>&, int, int) + 0x5f8 (libtorch_cuda.so)
-[default3]:[rank35]: frames #12-#20: unnamed frames in libtorch_cpu.so / libtorch_python.so
-[default3]:[rank35]: frames #21-#63: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, _PyObject_FastCallDictTstate, _PyObject_Call_Prepend)
-[default3]:[rank35]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-(Ranks 37, 40, 43, 47, 48, 54, 55, 57, 59, 61, 62 and 63 emit the same Python traceback, DistBackendError and C++ backtrace, interleaved in the log; only the per-rank frame addresses differ.)
0x4c12 (0x5641ca339142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: frame #63: PyObject_Call + 0xbc (0x55e585d5af1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default7]:[rank47]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5594ae90ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5595d373ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5641ca344a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #32: + 0x150582 (0x5636a7f4c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5636a7f318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5594ae8fd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: frame #30: + 0x150582 (0x5594ae918582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default3]:[rank43]: frame #26: PyObject_Call + 0xbc (0x5641ca350f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5641ca3372b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #34: + 0x150582 (0x5636a7f4c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5641ca344a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5641ca3358fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward 
-[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: frame #30: + 0x150582 (0x5641ca350582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5595d372d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5636a7f318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: frame #30: + 0x150582 (0x5595d3748582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5595d372d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: sharded_logits = self.model( -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5641ca3358fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5594ae8fd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5636a7f38f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: frame #32: + 0x150582 (0x5595d3748582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: frame #32: + 0x150582 (0x5641ca350582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5636a7f4ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: sharded_logits = self.model( -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5595d372d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 
237, in -[default3]:[rank59]: frame #38: + 0x211239 (0x5636a800d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5636a7f39a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default0]:[rank40]: frame #34: + 0x150582 (0x5595d3748582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5636a7f353e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5636a7f40a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: Traceback (most recent call last): -[default1]:[rank41]: trainer.train(dataloader) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5595d372d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5595d3734f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5636a7f30c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5595d3746c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: trainer.train(dataloader) -[default3]:[rank59]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5636a7f40a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: frame 
#44: _PyEval_EvalFrameDefault + 0x13ca (0x5636a7f318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: frame #45: + 0x150582 (0x5636a7f4c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: trainer.train(dataloader) -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: frame #32: + 0x150582 (0x5594ae918582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5594ae8fd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: frame #46: PyObject_Call + 0xbc (0x5636a7f4cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5641ca3358fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: frame #34: + 0x150582 (0x5641ca350582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5641ca3358fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5636a7f332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: frame #48: + 0x150582 (0x5636a7f4c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: frame #34: + 0x150582 (0x5594ae918582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5594ae8fd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5594ae904f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5641ca33cf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: frame #38: + 0x211239 (0x5595d3809239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default4]:[rank52]: return self._call_impl(*args, 
**kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5594ae916c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #49: PyObject_Call + 0xbc (0x5636a7f4cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5636a7f332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5636a7f40a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: frame #38: + 0x211239 (0x5594ae9d9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5594ae905a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5595d3735a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5636a7f39007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5594ae9013e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5595d37313e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5641ca34ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank63]: 
hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5636a7f4ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5594ae90ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank43]: frame #38: + 0x211239 (0x5641ca411239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5594ae8fcc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #54: + 0x211239 (0x5636a800d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5594ae90ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: frame #55: PyObject_Call + 0x207 (0x5636a7f4d067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5636a7f332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: sharded_logits = self.model( -[default3]:[rank43]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5641ca33da6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5641ca3393e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: return forward_call(*args, 
**kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5595d373ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5594ae8fd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #45: + 0x150582 (0x5594ae918582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: frame #46: PyObject_Call + 0xbc (0x5594ae918f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return func(*args, **kwargs) -[default3]:[rank43]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5641ca344a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5641ca334c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5594ae8ff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward 
-[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default3]:[rank59]: frame #57: + 0x150582 (0x5636a7f4c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5636a7f318fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5641ca344a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #48: + 0x150582 (0x5594ae918582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank47]: frame #49: PyObject_Call + 0xbc (0x5594ae918f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #59: + 0x150582 (0x5636a7f4c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #60: PyObject_Call + 0xbc (0x5636a7f4cf1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5641ca3358fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5595d372cc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5594ae8ff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5595d373ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5636a7f332b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank45]: sharded_logits = self.model( -[default6]:[rank62]: dist.recv( -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5595d372d8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #45: + 0x150582 (0x5595d3748582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff31edd1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: frame #62: + 0x150582 (0x5636a7f4c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #63: PyObject_Call + 0xbc (0x5636a7f4cf1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default6]:[rank62]: return func(*args, **kwargs) -[default2]:[rank50]: frame #1: + 0x5b3a23e (0x7ff3588ee23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #45: + 0x150582 (0x5641ca350582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5594ae90ca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7ff3588e8c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #46: PyObject_Call + 0xbc (0x5641ca350f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5641ca3372b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() 
-[default5]:[rank53]: return forward_call(*args, **kwargs) -[default2]:[rank50]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7ff3588e8f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7ff3588e9fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5594ae905007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5594ae916c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank50]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff35889e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: pipeline_state.run_communication() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank50]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff35889e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0bf3ff0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: frame #1: + 0x5b3a23e (0x7f0c2db0d23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff35889e371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff35889e371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: dist.recv( -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default6]:[rank62]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f0c2db07c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f0c2db07f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f0c2db08fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank62]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0c2dabd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default7]:[rank47]: frame #54: + 0x211239 (0x5594ae9d9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #48: + 0x150582 (0x5641ca350582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #46: PyObject_Call + 0xbc (0x5595d3748f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5595d372f2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0c2dabd371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7ff3200ab189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #55: PyObject_Call + 0x207 (0x5594ae919067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5594ae8ff2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[... remainder of the deleted log: ranks 40, 41, 42, 43, 45, 46, 47, 49, 50, 51, 52, 53, 62 and 63 all abort with the same NCCL communicator-setup failure during the pipeline-parallel activation receive; their interleaved copies differ only in library load addresses. One representative traceback follows (rank 42 down to the pipeline block, rank 53 from there through the failing recv); repeated torch.nn.Module _wrapped_call_impl/_call_impl dispatch frames are shown once ...]
-[default2]:[rank42]: Traceback (most recent call last):
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank42]:     trainer.train(dataloader)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank42]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank42]:     outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:[rank42]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank42]:     output = model(**micro_batch)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]:     return self._call_impl(*args, **kwargs)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]:     return forward_call(*args, **kwargs)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank42]:     sharded_logits = self.model(
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank42]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank42]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank42]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank53]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank53]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank53]:     pipeline_state.run_communication()
-[default5]:[rank53]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank53]:     recv_activation_tensor = recv_activation()
-[default5]:[rank53]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank53]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank53]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank53]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank53]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank53]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank53]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default5]:[rank53]:     dist.recv(
-[default5]:[rank53]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank53]:     return func(*args, **kwargs)
-[default5]:[rank53]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank53]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank53]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default5]:[rank53]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default5]:[rank53]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fafac1c7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:[rank53]: frame #1: <unknown function> + 0x5b3a23e (0x7fafe5ce423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x2c7 (0x7fafe5cdec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fafe5cdef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fafe5cdffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fafe5c94371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fafe5c94371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fafe5c94371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fafe5c94371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default5]:[rank53]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fafad4a1189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank53]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fafad4a8610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:[rank53]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor>&, int, int) + 0x5f8 (0x7fafad4c7978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[... frames #12-#63 on every failing rank are <unknown function> offsets in libtorch_cpu.so/libtorch_python.so followed by CPython interpreter frames (_PyEval_EvalFrameDefault, _PyFunction_Vectorcall, _PyObject_MakeTpCall, PyObject_Call, ...); each backtrace ends with ". This may indicate a possible application crash on rank 0 or a network set up issue." ...]
-[default3]:[rank51]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: frame #18: + 0x5af36b5 (0x7f701a0686b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: frame #19: + 0xd2631e (0x7f702cc5231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: frame #63: PyObject_Call + 0xbc (0x559cb7a4bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7ff48867b978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55ae1865e142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55ae18669a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: frame #20: + 0x47def4 (0x7f702c3a9ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank53]: frame #26: PyObject_Call + 0xbc (0x55ae18675f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5570427923e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: frame #12: + 0x5adc309 (0x7ff4c0e3a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe12fef6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fe12fef6371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #9: 
c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fe0f7703189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: frame #21: + 0x1445a6 (0x564695b115a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank53]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55ae1865c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55ae18669a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fe0f770a610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55ae1865a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fe0f7729978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #12: + 0x5adc309 (0x7fe12fee8309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55704279da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: frame #13: + 0x5ae6f10 (0x7fe12fef2f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank53]: frame #30: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55704278dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) 
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55ae1865a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55704279da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: sharded_logits = self.model( -[default6]:[rank46]: frame #22: _PyObject_MakeTpCall + 0x26b (0x564695b0aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #23: + 0x150866 (0x564695b1d866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: frame #32: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: dist.recv( -[default4]:[rank52]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55704278e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #13: + 0x5ae6f10 (0x7ff4c0e44f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55ae1865a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #34: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #14: + 0x5ae6fa5 (0x7fe12fef2fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default5]:[rank53]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55ae1865a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: frame #15: + 0x5124446 (0x7fe12f530446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #16: + 0x1acf4b8 (0x7fe12bedb4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #24: 
_PyEval_EvalFrameDefault + 0x4c12 (0x564695b06142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55ae18661f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: sharded_logits = self.model( -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default2]:[rank42]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank46]: frame #25: _PyFunction_Vectorcall + 0x6c (0x564695b11a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: frame #26: PyObject_Call + 0xbc (0x564695b1df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55ae18673c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #14: + 0x5ae6fa5 (0x7ff4c0e44fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #15: + 0x5124446 (0x7ff4c0482446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #38: + 0x211239 (0x55ae18736239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x564695b042b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55ae18662a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: frame #17: + 0x5aee004 (0x7fe12fefa004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank41]: frame #16: + 0x1acf4b8 (0x7ff4bce2d4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #28: _PyFunction_Vectorcall + 0x6c (0x564695b11a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #45: + 0x150582 (0x5570427a9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #46: PyObject_Call + 0xbc (0x5570427a9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default2]:[rank42]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf0337e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank45]: frame #18: + 0x5af36b5 (0x7fe12feff6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5570427902b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: frame #19: + 0xd2631e (0x7fe142ae931e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank41]: frame #17: + 0x5aee004 (0x7ff4c0e4c004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #48: + 0x150582 (0x5570427a9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #49: PyObject_Call + 0xbc (0x5570427a9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #20: + 0x47def4 (0x7fe142240ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x564695b028fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55ae1865e3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #1: + 0x5b3a23e (0x7fdf3ce9b23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55ae18669a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fdf3ce95c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #18: + 0x5af36b5 (0x7ff4c0e516b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: frame #19: + 0xd2631e (0x7ff4d3a3b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank53]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55ae18659c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55ae18669a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #30: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55ae1865a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x564695b028fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5570427902b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #20: + 0x47def4 (0x7ff4d3192ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank41]: frame #21: + 0x1445a6 (0x55ad582a55a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fdf3ce95f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fdf3ce96fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #45: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #46: PyObject_Call + 0xbc (0x55ae18675f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #32: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x564695b028fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55704279da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55ae1865c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #34: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #48: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #49: PyObject_Call + 0xbc (0x55ae18675f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55ad5829ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdf3ce4b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #6: c10d::PrefixStore::get(std::string 
const&) + 0x31 (0x7fdf3ce4b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x557042796007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdf3ce4b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #23: + 0x150866 (0x55ad582b1866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #21: + 0x1445a6 (0x55d1bb74f5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5570427a7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x564695b028fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #54: + 0x211239 (0x55704286a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdf3ce4b371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fdf04658189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55ae1865c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #55: PyObject_Call + 0x207 (0x5570427aa067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x564695b09f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #37: _PyObject_Call_Prepend + 0x69 (0x564695b1bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55ae18669a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fdf0465f610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55d1bb748a6b in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #23: + 0x150866 (0x55d1bb75b866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55ae18662007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fdf0467e978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #12: + 0x5adc309 (0x7fdf3ce3d309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55ae18673c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55d1bb744142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55ad5829a142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55ad582a5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5570427902b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #57: + 0x150582 (0x5570427a9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #13: + 0x5ae6f10 (0x7fdf3ce47f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: frame #26: PyObject_Call + 0xbc (0x55ad582b1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #54: + 0x211239 (0x55ae18736239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #55: PyObject_Call + 0x207 (0x55ae18676067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55ad582982b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55ad582a5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55d1bb74fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55704278e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #59: + 0x150582 
(0x5570427a9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55ae1865c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #26: PyObject_Call + 0xbc (0x55d1bb75bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #14: + 0x5ae6fa5 (0x7fdf3ce47fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: frame #15: + 0x5124446 (0x7fdf3c485446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #57: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #16: + 0x1acf4b8 (0x7fdf38e304b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: frame #38: + 0x211239 (0x564695bde239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #60: PyObject_Call + 0xbc (0x5570427a9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5570427902b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #62: + 0x150582 (0x5570427a9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #17: + 0x5aee004 (0x7fdf3ce4f004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #18: + 0x5af36b5 (0x7fdf3ce546b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55ae1865a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #59: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #39: _PyObject_MakeTpCall + 0x26b (0x564695b0aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #63: PyObject_Call + 0xbc (0x5570427a9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x564695b063e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: pipeline_state.run_communication() -[default5]:[rank53]: frame #60: PyObject_Call + 0xbc (0x55ae18675f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: frame #41: _PyFunction_Vectorcall + 0x6c (0x564695b11a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #19: + 
0xd2631e (0x7fdf4fa3e31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #20: + 0x47def4 (0x7fdf4f195ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank53]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55ae1865c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank45]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1bb7422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55ad582968fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: frame #30: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #62: + 0x150582 (0x55ae18675582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55ad582968fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #63: PyObject_Call + 0xbc (0x55ae18675f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #21: + 0x1445a6 (0x5584f08c25a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5584f08bba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: frame #32: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55ad582968fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: frame #23: + 0x150866 (0x5584f08ce866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5584f08b7142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) 
-[default1]:[rank41]: frame #34: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55ad582968fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default2]:[rank42]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5584f08c2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: dist.recv( -[default2]:[rank42]: frame #26: PyObject_Call + 0xbc (0x5584f08cef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank41]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55ad5829df50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: dist.recv( -[default1]:[rank41]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55ad582afc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5584f08b52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5584f08c2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank41]: frame #38: + 0x211239 (0x55ad58372239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x564695b01c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5584f08b38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return func(*args, **kwargs) -[default2]:[rank42]: frame #30: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: frame #43: _PyFunction_Vectorcall + 0x6c (0x564695b11a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x564695b028fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5584f08b38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #32: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank46]: frame #45: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55ad5829ea6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5584f08b38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f38bf8f2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: frame #34: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank41]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55ad5829a3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #1: + 0x5b3a23e (0x7f38f940f23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank41]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55ad582a5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank51]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf6b223897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5584f08b38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f38f9409c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55d1bb74fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55ad58295c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55ad582a5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #1: + 0x5b3a23e 
(0x7fdfa4d4023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #46: PyObject_Call + 0xbc (0x564695b1df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f38f9409f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5584f08baf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5584f08ccc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f38f940afd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fdfa4d3ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55d1bb7408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f38f93bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f38f93bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #30: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fdfa4d3af82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x564695b042b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #38: + 0x211239 (0x5584f098f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5584f08bba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fdfa4d3bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f38f93bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f38f93bf371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f38c0bcc189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55d1bb7408fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f38c0bd3610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f38c0bf2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #32: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5584f08b73e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5584f08c2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #12: + 0x5adc309 (0x7f38f93b1309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5584f08b2c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #13: + 0x5ae6f10 (0x7f38f93bbf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5584f08c2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfa4cf0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfa4cf0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55d1bb7408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfa4cf0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #34: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #14: + 0x5ae6fa5 (0x7f38f93bbfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5584f08b38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #45: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fdfa4cf0371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #48: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #15: + 0x5124446 (0x7f38f89f9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #49: 
PyObject_Call + 0xbc (0x564695b1df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #16: + 0x1acf4b8 (0x7f38f53a44b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fdf6c4fd189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #46: PyObject_Call + 0xbc (0x5584f08cef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #17: + 0x5aee004 (0x7f38f93c3004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #18: + 0x5af36b5 (0x7f38f93c86b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55d1bb7408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fdf6c504610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55d1bb747f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #19: + 0xd2631e (0x7f390bfb231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank49]: frame #20: + 0x47def4 (0x7f390b709ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5584f08b52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #21: + 0x1445a6 (0x563565d645a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #48: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fdf6c523978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank45]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55d1bb759c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563565d5da6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #23: + 0x150866 (0x563565d70866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #38: + 0x211239 (0x55d1bb81c239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563565d59142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #49: PyObject_Call + 0xbc (0x5584f08cef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5584f08b52b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563565d64a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #26: PyObject_Call + 0xbc (0x563565d70f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x564695b042b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #12: + 0x5adc309 (0x7fdfa4ce2309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55d1bb748a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563565d572b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55ad582968fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #45: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563565d64a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #13: + 0x5ae6f10 (0x7fdfa4cecf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55d1bb7443e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5584f08c2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5584f08bb007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563565d558fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #51: _PyFunction_Vectorcall + 0x6c (0x564695b11a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #30: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5584f08ccc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #54: + 0x211239 (0x5584f098f239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #14: + 0x5ae6fa5 (0x7fdfa4cecfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x564695b0a007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563565d558fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #53: _PyObject_Call_Prepend + 0x69 (0x564695b1bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #32: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default3]:[rank51]: frame #15: + 0x5124446 (0x7fdfa432a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #55: PyObject_Call + 0x207 (0x5584f08cf067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563565d558fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5584f08b52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #16: + 0x1acf4b8 (0x7fdfa0cd54b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #57: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #17: + 0x5aee004 (0x7fdfa4cf4004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #34: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563565d558fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563565d5cf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55d1bb74fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #18: + 0x5af36b5 (0x7fdfa4cf96b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55d1bb73fc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #54: + 0x211239 (0x564695bde239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5584f08b38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #59: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #19: + 0xd2631e (0x7fdfb78e331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default1]:[rank41]: frame #46: PyObject_Call + 0xbc (0x55ad582b1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #20: + 0x47def4 (0x7fdfb703aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: frame #21: + 0x1445a6 (0x55948b9b05a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55ad582982b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55948b9a9a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55d1bb74fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #37: 
_PyObject_Call_Prepend + 0x69 (0x563565d6ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #60: PyObject_Call + 0xbc (0x5584f08cef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #38: + 0x211239 (0x563565e31239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #23: + 0x150866 (0x55948b9bc866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5584f08b52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563565d5da6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55d1bb7408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55948b9a5142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #55: PyObject_Call + 0x207 (0x564695b1e067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563565d593e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #45: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55948b9b0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #62: + 0x150582 (0x5584f08ce582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #63: PyObject_Call + 0xbc (0x5584f08cef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563565d64a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #46: PyObject_Call + 0xbc (0x55d1bb75bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #26: PyObject_Call + 0xbc (0x55948b9bcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1bb7422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #48: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank49]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563565d54c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x564695b042b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563565d64a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55948b9a32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #57: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x564695b028fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563565d558fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #49: PyObject_Call + 0xbc (0x55ad582b1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #59: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55948b9b0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #48: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55948b9a18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #30: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #60: PyObject_Call + 0xbc (0x564695b1df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x564695b042b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #45: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55ad582982b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55948b9a18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #49: PyObject_Call + 0xbc (0x55d1bb75bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #32: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #62: + 0x150582 (0x564695b1d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55ad582a5a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55948b9a18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #34: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #63: PyObject_Call + 0xbc (0x564695b1df1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #46: PyObject_Call + 0xbc (0x563565d70f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank45]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1bb7422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55948b9a18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55ad5829e007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55948b9a8f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55d1bb74fa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55948b9bac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #38: + 0x211239 (0x55948ba7d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55ad582afc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #54: + 0x211239 (0x55ad58372239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55948b9a9a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563565d572b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #48: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #49: PyObject_Call + 0xbc (0x563565d70f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #55: PyObject_Call + 0x207 (0x55ad582b2067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55948b9a53e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55d1bb748007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55948b9b0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55948b9a0c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55d1bb759c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55ad582982b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55948b9b0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #54: + 0x211239 (0x55d1bb81c239 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563565d572b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #55: PyObject_Call + 0x207 (0x55d1bb75c067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1bb7422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55948b9a18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #57: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563565d64a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55d1bb7408fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563565d5d007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #45: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #57: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563565d6ec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #59: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #54: + 0x211239 (0x563565e31239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55ad582968fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #46: PyObject_Call + 0xbc (0x55948b9bcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #60: PyObject_Call + 0xbc (0x55d1bb75bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #55: PyObject_Call + 0x207 (0x563565d71067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563565d572b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #57: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #59: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55948b9a32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #60: PyObject_Call + 0xbc (0x55ad582b1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #48: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563565d558fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #61: 
_PyEval_EvalFrameDefault + 0x2d83 (0x55ad582982b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #62: + 0x150582 (0x55ad582b1582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #49: PyObject_Call + 0xbc (0x55948b9bcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: frame #63: PyObject_Call + 0xbc (0x55ad582b1f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55948b9a32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55948b9b0a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55d1bb7422b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55948b9a9007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank41]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank49]: frame #59: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #62: + 0x150582 (0x55d1bb75b582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #63: PyObject_Call + 0xbc (0x55d1bb75bf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #60: PyObject_Call + 0xbc (0x563565d70f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55948b9bac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default1]:[rank49]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563565d572b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #54: + 0x211239 (0x55948ba7d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #55: PyObject_Call + 0x207 (0x55948b9bd067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55948b9a32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #57: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55948b9a18fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #59: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #60: PyObject_Call + 0xbc (0x55948b9bcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55948b9a32b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #62: + 0x150582 (0x563565d70582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #62: + 0x150582 (0x55948b9bc582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #63: PyObject_Call + 0xbc (0x563565d70f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #63: PyObject_Call + 0xbc (0x55948b9bcf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank51]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-W0703 00:43:38.035000 139802089281344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 863686 closing signal SIGTERM -W0703 00:43:38.035000 139802089281344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 863687 closing signal SIGTERM -W0703 00:43:38.035000 139802089281344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 863689 closing signal SIGTERM -W0703 00:43:38.035000 139802089281344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 863690 closing signal SIGTERM -E0703 00:43:38.161000 140584891750208 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1772663) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:43:38 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1772664) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:43:38 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1772665) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:43:38 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1772666) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:43:38 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1772667) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:43:38 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1772668) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_00:43:38 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1772669) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_00:43:38 - host : 
ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1772670) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:43:38 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1772663) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -E0703 00:43:39.168000 139802089281344 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 863684) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:43:39.175000 139802089281344 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_863611_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:39.203000 139802089281344 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_863611_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:39.219000 139802089281344 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_863611_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:43:38 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 863685) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:43:38 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 863688) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:43:38 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 863691) - error_file: - traceback : To 
enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:43:38 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 863684) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -W0703 00:43:42.036000 139700073875200 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3759110_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:42.601000 140707287561984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1414869_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:42.888000 139632414758656 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-162-233.ec2.internal_1394219_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:42.935000 140582689257216 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3887957_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:42.997000 139707802011392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1136238_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.007000 140504039540480 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_876959_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 00:43:43.038000 140712948295488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1414946 closing signal SIGTERM -W0703 00:43:43.038000 140712948295488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1414950 closing signal SIGTERM -W0703 00:43:43.042000 139713462744896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1136311 closing signal SIGTERM -W0703 00:43:43.042000 139713462744896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1136313 closing signal SIGTERM -W0703 00:43:43.042000 139713462744896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1136315 closing signal SIGTERM -W0703 00:43:43.042000 139713462744896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1136317 closing signal SIGTERM -W0703 00:43:43.043000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394292 closing signal SIGTERM -W0703 00:43:43.043000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394293 closing signal SIGTERM -W0703 00:43:43.044000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394294 closing signal SIGTERM -W0703 00:43:43.044000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394295 closing signal SIGTERM -W0703 00:43:43.044000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394296 closing signal SIGTERM -W0703 00:43:43.045000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394297 closing signal SIGTERM -W0703 00:43:43.045000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394298 closing signal SIGTERM -W0703 00:43:43.045000 139638075492160 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1394299 closing signal SIGTERM -W0703 00:43:43.044000 139705734608704 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3759183 closing signal SIGTERM -W0703 00:43:43.044000 139705734608704 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3759185 closing signal SIGTERM -W0703 00:43:43.044000 139705734608704 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3759186 closing signal SIGTERM -W0703 00:43:43.045000 139705734608704 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3759188 closing signal SIGTERM -W0703 00:43:43.045000 139705734608704 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3759189 closing signal SIGTERM -W0703 00:43:43.045000 139705734608704 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3759190 closing signal SIGTERM -W0703 00:43:43.045000 140588349990720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3888035 closing signal SIGTERM -W0703 00:43:43.045000 140588349990720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3888036 closing signal SIGTERM -W0703 00:43:43.045000 140588349990720 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3888037 closing signal SIGTERM -E0703 00:43:43.167000 140509700273984 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 877032) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:43:43.174000 140509700273984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 
'ip-26-0-171-88.ec2.internal_876959_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.201000 140509700273984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_876959_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.231000 140509700273984 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_876959_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 877033) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 877034) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 877035) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 877036) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 877037) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 877038) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 877039) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-88.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 877032) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 00:43:43.473000 140588349990720 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3888030) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -W0703 00:43:43.479000 140588349990720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3887957_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.510000 140588349990720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3887957_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.529000 140588349990720 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3887957_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-62.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 3888031) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-62.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 3888032) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-62.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 3888033) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:43:43 - host : 
ip-26-0-171-62.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 3888034) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-62.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 3888030) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 00:43:43.572000 140712948295488 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1414943) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:43:43.577000 140712948295488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1414869_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.605000 140712948295488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1414869_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.628000 140712948295488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1414869_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-153.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 1414944) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-153.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 1414945) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 1414947) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-153.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 1414948) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-153.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 1414949) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 1414943) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 00:43:43.671000 139705734608704 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 3759184) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:43:43.677000 139705734608704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3759110_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.704000 139705734608704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3759110_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:43.715000 139705734608704 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3759110_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-102.ec2.internal - rank : 44 (local_rank: 4) - exitcode : 1 (pid: 3759187) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:43:43 - host : ip-26-0-171-102.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 3759184) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -E0703 00:43:44.070000 139713462744896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 1136312) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:43:44.076000 139713462744896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1136238_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:44.103000 139713462744896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1136238_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:44.120000 139713462744896 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1136238_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1136314) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1136316) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1136318) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:43:43 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1136312) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0703 00:43:44.181000 139638075492160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1394219_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:43:44.195000 139638075492160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1394219_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-162-233: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
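Reading the log above: the run appears to die during NCCL communicator setup (the interleaved C++ frames go ProcessGroupNCCL::recv -> getNCCLComm -> broadcastUniqueNCCLID -> PrefixStore::get), after which the torchrun agents fail with ChildFailedError or RendezvousConnectionError ("Broken pipe" against the c10d store) and all eight srun tasks exit with code 1. The bench.slurm scripts in this diff classify a failed log only by grepping for OutOfMemoryError, an illegal memory access, or an NCCL timeout, so a launch failure like this one ends up recorded as a plain "fail" in status.txt. Below is a minimal, standalone sketch of that grep-based classification, not part of the original scripts; the script name, the positional log argument, and the extra RendezvousConnectionError/DistNetworkError check are assumptions added here purely for illustration.

#!/bin/bash
# classify_log.sh (illustrative sketch, not from the original repo):
# map a benchmark log to one of the status strings the deleted scripts
# write into status.txt when srun exits non-zero.
log_file="${1:?usage: classify_log.sh <log.out>}"

if grep -q "OutOfMemoryError" "$log_file"; then
    status="oom"
elif grep -q "CUDA error: an illegal memory access" "$log_file"; then
    # the original scripts also file illegal memory accesses under "oom"
    status="oom"
elif grep -q "Timeout at NCCL" "$log_file"; then
    status="timeout"
elif grep -Eq "RendezvousConnectionError|DistNetworkError" "$log_file"; then
    # assumed extra bucket for rendezvous/store connection loss, as seen in the
    # log above; the original scripts have no such check and fall through to "fail"
    status="fail"
else
    status="fail"
fi

printf "%s" "$status"

The "completed" and "running" states are handled elsewhere in the original scripts (from the srun exit status and the squeue poll, respectively), so this sketch only covers the failure branch.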
diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-256/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/bench.slurm deleted file mode 100644 index ab63767bae196087d15626951cead80247334050..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/config.yaml deleted file mode 100644 index ade8c7738f29813a48c53bac0c89ef55961e7fb8..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 16 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 32 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out deleted file mode 100644 index 0121c2149f5a2d718e31a0adf595af26304628d3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/log.out +++ /dev/null @@ -1,3364 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:55:36 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:55:41.618000 140574387312448 torch/distributed/run.py:757] -W0703 09:55:41.618000 140574387312448 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.618000 140574387312448 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:55:41.618000 140574387312448 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.649000 140345590765376 torch/distributed/run.py:757] -W0703 09:55:41.649000 140345590765376 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.649000 140345590765376 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:55:41.649000 140345590765376 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.674000 140339776939840 torch/distributed/run.py:757] -W0703 09:55:41.674000 140339776939840 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.674000 140339776939840 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:55:41.674000 140339776939840 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.679000 139738299811648 torch/distributed/run.py:757] -W0703 09:55:41.679000 139738299811648 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.679000 139738299811648 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:55:41.679000 139738299811648 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.744000 139954007369536 torch/distributed/run.py:757] -W0703 09:55:41.744000 139954007369536 torch/distributed/run.py:757] ***************************************** -W0703 09:55:41.744000 139954007369536 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:55:41.744000 139954007369536 torch/distributed/run.py:757] ***************************************** -W0703 09:55:42.097000 139999195637568 torch/distributed/run.py:757] -W0703 09:55:42.097000 139999195637568 torch/distributed/run.py:757] ***************************************** -W0703 09:55:42.097000 139999195637568 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:55:42.097000 139999195637568 torch/distributed/run.py:757] ***************************************** -W0703 09:55:42.255000 139801600980800 torch/distributed/run.py:757] -W0703 09:55:42.255000 139801600980800 torch/distributed/run.py:757] ***************************************** -W0703 09:55:42.255000 139801600980800 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:55:42.255000 139801600980800 torch/distributed/run.py:757] ***************************************** -W0703 09:55:42.833000 140360635504448 torch/distributed/run.py:757] -W0703 09:55:42.833000 140360635504448 torch/distributed/run.py:757] ***************************************** -W0703 09:55:42.833000 140360635504448 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:55:42.833000 140360635504448 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:56:07 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=2, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:56:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=32, -[default0]:07/03/2024 09:56:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=16, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32')), -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 09:56:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default6]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=6|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=1|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=5|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=1|ip-26-0-169-132]: No checkpoint path provided. -[default3]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=11|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=8|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=9|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=3|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=2|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. 
-[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=4|ip-26-0-169-132]: No checkpoint path provided. -[default4]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=4|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=2|ip-26-0-169-132]: No checkpoint path provided. -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=5|ip-26-0-169-132]: No checkpoint path provided. -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=6|ip-26-0-169-132]: No checkpoint path provided. -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=0|ip-26-0-169-132]: No checkpoint path provided. -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=3|ip-26-0-169-132]: No checkpoint path provided. 
-[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=10|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=14|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=11|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=4|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=0|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=9|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=13|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=13|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=10|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=8|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=12|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=14|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=7|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. 
Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: No checkpoint path provided. -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-169-132]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-169-132]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=7|ip-26-0-169-132]: No checkpoint path provided. -[default1]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=1|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=15|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=3|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=6|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=7|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=2|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=12|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 09:56:24 [INFO|DP=1|PP=0|TP=15|ip-26-0-168-238]: No checkpoint path provided. 
-[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 09:56:24 [INFO|DP=1|PP=1|TP=5|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 09:56:25 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. 
-[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 09:56:25 [INFO|DP=0|PP=1|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 09:56:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:56:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:56:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 09:56:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 09:56:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:56:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 09:56:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:56:27 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:56:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:56:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:56:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 09:56:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 09:56:29.274166 | mbs: 32 | grad_accum: 16 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:56:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:56:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default1]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=9|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=3|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=10|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=14|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. 
Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=15|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=1|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=6|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=15|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=12|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:56:29 [WARNING|DP=0|PP=1|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:56:29 [WARNING|DP=0|PP=1|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=1|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=5|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=2|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=11|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=8|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=4|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:56:29 [WARNING|DP=0|PP=1|TP=4|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:56:29 [WARNING|DP=0|PP=1|TP=5|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:56:29 [WARNING|DP=0|PP=1|TP=2|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=9|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=11|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=14|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=0|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:56:29 [WARNING|DP=1|PP=0|TP=8|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=4|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=10|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=13|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:56:29 [WARNING|DP=0|PP=1|TP=7|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=3|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:56:29 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=2|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:56:29 [WARNING|DP=1|PP=1|TP=5|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. 
Setting CardData to empty.
-[default3]:07/03/2024 09:56:40 [WARNING|DP=0|PP=1|TP=3|ip-26-0-169-132]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default3]:07/03/2024 09:56:40 [WARNING|DP=0|PP=1|TP=3|ip-26-0-169-132]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600037 milliseconds before timing out.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600033 milliseconds before timing out.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600034 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600035 milliseconds before timing out.
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600033 milliseconds before timing out.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600019 milliseconds before timing out.
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600097 milliseconds before timing out.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600093 milliseconds before timing out.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600069 milliseconds before timing out.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600062 milliseconds before timing out.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600069 milliseconds before timing out.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600068 milliseconds before timing out.
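Every watchdog line above reports Timeout(ms)=600000: the pending point-to-point SEND on each rank exceeded the process group's 10-minute window, so ProcessGroupNCCL aborted the communicator. As a hedged aside (not taken from this run's code or the nanotron config), that window is the `timeout` passed to `torch.distributed.init_process_group`; a minimal sketch of where the number comes from:

```python
# Sketch only: illustrates the process-group timeout behind "Timeout(ms)=600000".
# Assumes a torchrun-style launch (env:// rendezvous); values are illustrative.
import datetime

import torch.distributed as dist

dist.init_process_group(
    backend="nccl",
    # Each outstanding collective / p2p op (such as the SEND in the log) must
    # complete within this window, or the NCCL watchdog aborts the communicator.
    timeout=datetime.timedelta(milliseconds=600_000),
)
```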
-[default3]:[rank11]: Traceback (most recent call last):
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank11]: trainer.train(dataloader)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank11]: grad_accumulator.backward(sum(activations))
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank11]: result = loss.backward()
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank11]: torch.autograd.backward(
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank11]: _engine_run_backward(
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank11]: return user_fn(self, *args)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank11]: pipeline_state.run_communication()
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default3]:[rank11]: self.grads_buffer.append(recv_grad())
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default3]:[rank11]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank11]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank11]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank11]: dist.recv(
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank11]: return func(*args, **kwargs)
-[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank11]: pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default7]:[rank7]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default7]:[rank7]: grad_accumulator.backward(sum(activations)) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default7]:[rank7]: result = loss.backward() -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default7]:[rank7]: torch.autograd.backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default7]:[rank7]: _engine_run_backward( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default7]:[rank7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default7]:[rank7]: return user_fn(self, *args) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default7]:[rank7]: pipeline_state.run_communication() -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default7]:[rank7]: self.grads_buffer.append(recv_grad()) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default7]:[rank7]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank7]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank7]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default7]:[rank7]: dist.recv( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank7]: return func(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank7]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: torch.autograd.backward( -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", 
line 295, in train_batch_iter -[default6]:[rank6]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank6]: grad_accumulator.backward(sum(activations)) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank6]: result = loss.backward() -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank6]: torch.autograd.backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank6]: _engine_run_backward( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank5]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank6]: return user_fn(self, *args) -[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank5]: dist.recv( -[default6]:[rank6]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank6]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank6]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank6]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank6]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank6]: dist.recv( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank6]: return func(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank6]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank2]: grad_accumulator.backward(sum(activations)) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank2]: result = loss.backward() -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank2]: torch.autograd.backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank2]: _engine_run_backward( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank2]: return user_fn(self, *args) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank2]: pipeline_state.run_communication() -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank2]: self.grads_buffer.append(recv_grad()) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank2]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank2]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank2]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default2]:[rank2]: dist.recv( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank2]: return func(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank2]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank1]: self.grads_buffer.append(recv_grad()) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank1]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank1]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank1]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank1]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank1]: dist.recv( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank1]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbfa0032897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbfa130bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbfa1310a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbfa1311dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fbfecdaae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbff1df1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbff1bbc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbfa0032897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fbfa130bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fbfa1310a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fbfa1311dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fbfecdaae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbff1df1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbff1bbc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fbfa0032897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fbfa0f95119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fbfecdaae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fbff1df1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fbff1bbc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank3]: self.grads_buffer.append(recv_grad()) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank3]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank3]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank3]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank3]: dist.recv( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank3]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4c4967897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd4c5c40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f65dd298897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd4c5c45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd4c5c46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd5116dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f65de571c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #5: + 0x8609 (0x7fd516726609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd5164f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f65de576a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f65de577dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f662a010e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f662f057609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f662ee22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4c4967897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd4c5c40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd4c5c45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd4c5c46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd5116dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #5: + 0x8609 (0x7fd516726609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd5164f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f65dd298897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f65de571c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4c4967897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f65de576a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #1: + 0xe32119 (0x7fd4c58ca119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f65de577dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f662a010e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #2: + 0xd3e95 (0x7fd5116dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f662f057609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #3: + 0x8609 (0x7fd516726609 in /lib/x86_64-linux-gnu/libpthread.so.0) 
-[default6]:frame #4: clone + 0x43 (0x7fd5164f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:frame #6: clone + 0x43 (0x7f662ee22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f65dd298897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f65de1fb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f662a010e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f662f057609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f662ee22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7cbec27897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f7cbff00c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7cbff05a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7cbff06dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f7d0b99fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f7d109e6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f7d107b1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7cbec27897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f7cbff00c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7cbff05a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7cbff06dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #4: <unknown function> + 0xd3e95 (0x7f7d0b99fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #5: <unknown function> + 0x8609 (0x7f7d109e6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #6: clone + 0x43 (0x7f7d107b1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7cbec27897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default5]:frame #1: <unknown function> + 0xe32119 (0x7f7cbfb8a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default5]:frame #2: <unknown function> + 0xd3e95 (0x7f7d0b99fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default5]:frame #3: <unknown function> + 0x8609 (0x7f7d109e6609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default5]:frame #4: clone + 0x43 (0x7f7d107b1353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default5]:
-[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank4]: self.grads_buffer.append(recv_grad()) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank4]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank4]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank4]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default4]:[rank4]: dist.recv( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank4]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank0]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank0]: grad_accumulator.backward(sum(activations)) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank0]: result = loss.backward() -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank0]: torch.autograd.backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank0]: _engine_run_backward( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank0]: return user_fn(self, *args) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank0]: pipeline_state.run_communication() -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank0]: self.grads_buffer.append(recv_grad()) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank0]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank0]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank0]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank0]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank0]: dist.recv( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank0]: return func(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank0]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc884e2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc897bbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc897c0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc897c1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7efcd525ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7efcda2a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7efcda06c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc884e2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efc897bbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efc897c0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efc897c1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7efcd525ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7efcda2a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7efcda06c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efc884e2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7efc89445119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7efcd525ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7efcda2a1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7efcda06c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8cf6321897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8cf75fac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8cf75ffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8cf7600dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8d43099e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8d480e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8d47eab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8cf6321897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8cf75fac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8cf75ffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8cf7600dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f8d43099e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f8d480e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f8d47eab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3feba0b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8cf6321897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3fecce4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3fecce9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3fecceadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4038783e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f403d7ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #1: + 0xe32119 (0x7f8cf7284119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #6: clone + 0x43 (0x7f403d595353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] 
Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default2]:frame #2: + 0xd3e95 (0x7f8d43099e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f8d480e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f8d47eab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3feba0b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3fecce4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3fecce9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3fecceadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4038783e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f403d7ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f403d595353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3feba0b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3fec96e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f4038783e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f403d7ca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f403d595353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f18422897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f196fbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f19700a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f19701dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5f6519ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5f6a1e1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f5f69fac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f18422897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f196fbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f19700a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f19701dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f5f6519ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f5f6a1e1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:frame #6: clone + 0x43 (0x7f5f69fac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f18422897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:frame #1: + 0xe32119 (0x7f5f19385119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f5f6519ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f5f6a1e1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f5f69fac353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3776253897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f377752cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3777531a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3777532dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f37c2fcbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f37c8012609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f37c7ddd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600088 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3776253897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f377752cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3777531a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3777532dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f37c2fcbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f37c8012609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f37c7ddd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3776253897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f37771b6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f37c2fcbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f37c8012609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f37c7ddd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa91c95f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa91dc38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa91dc3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa91dc3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fa9696d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:frame #5: + 0x8609 (0x7fa96e71e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:frame #6: clone + 0x43 (0x7fa96e4e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default5]: -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa91c95f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa91dc38c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa91dc3da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa91dc3edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank33]: pipeline_state.run_communication() -[default5]:frame #4: + 0xd3e95 (0x7fa9696d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fa96e71e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fa96e4e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa91c95f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fa91d8c2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:frame #2: + 0xd3e95 (0x7fa9696d7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default5]:frame #3: + 0x8609 (0x7fa96e71e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fa96e4e9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at 
NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default1]:[rank33]:     dist.recv(
-[default1]:[rank33]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank33]:     return func(*args, **kwargs)
-[default1]:[rank33]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank33]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6527321897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x1d2 (0x7f65285fac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f65285ffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6528600dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: <unknown function> + 0xd3e95 (0x7f6574099e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: <unknown function> + 0x8609 (0x7f65790e0609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7f6578eab353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]:  what():  [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6527321897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: <unknown function> + 0xe32119 (0x7f6528284119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: <unknown function> + 0xd3e95 (0x7f6574099e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: <unknown function> + 0x8609 (0x7f65790e0609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7f6578eab353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[…] the same ProcessGroupNCCL.cpp:1537/577/583 messages, 'terminate called'/what() output and checkTimeout/ncclCommWatchdog stack traces (differing only in library load addresses) were also emitted by ranks 34, 36, 38, 39, 42, 43, 44 and 46 of [PG 4 Rank 1] and by rank 0 of [PG 4 Rank 0]; only their per-rank watchdog timeout lines are kept:
-[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600097 milliseconds before timing out.
-[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600090 milliseconds before timing out.
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600093 milliseconds before timing out.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600069 milliseconds before timing out.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600066 milliseconds before timing out.
-[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600090 milliseconds before timing out.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600062 milliseconds before timing out.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] (…) ran for 600068 milliseconds before timing out.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600035 milliseconds before timing out.
-[default2]:[rank10]: Traceback (most recent call last):
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank10]:     trainer.train(dataloader)
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank10]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank10]:     outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default2]:[rank10]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default2]:[rank10]:     grad_accumulator.backward(sum(activations))
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default2]:[rank10]:     result = loss.backward()
-[default2]:[rank10]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default2]:[rank10]:     torch.autograd.backward(
-[default2]:[rank10]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default2]:[rank10]:     _engine_run_backward(
-[default2]:[rank10]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default2]:[rank10]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default2]:[rank10]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default2]:[rank10]:     return user_fn(self, *args)
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default2]:[rank10]:     pipeline_state.run_communication()
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default2]:[rank10]:     self.grads_buffer.append(recv_grad())
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default2]:[rank10]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank10]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank10]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank10]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default2]:[rank10]:     dist.recv(
-[default2]:[rank10]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank10]:     return func(*args, **kwargs)
-[default2]:[rank10]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank10]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank8]: Traceback (most recent call last):
-[…] rank 8 ([default0]) raised the identical traceback (run_train.py:237 → trainer.py:429/462 → engine.py:295/86 → gradient_accumulator.py:205 → torch.autograd → functional.py:40 → state.py:172/50 → p2p.py:353/326/269 → dist.recv), ending in:
-[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[…] ranks 4, 12, 13 and 14 of [PG 4 Rank 0] then hit the same watchdog timeout; the repeated ProcessGroupNCCL messages and stack traces are omitted and only the per-rank timeout lines are kept:
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600037 milliseconds before timing out.
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] (…) ran for 600002 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] (…) ran for 600003 milliseconds before timing out.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]:  what():  [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600096 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2461606897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f24628dfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f24628e4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f24628e5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f24ae37ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f24b33c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f24b3190353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2461606897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f2462569119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f24ae37ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f24b33c5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f24b3190353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1197913897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1198becc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1198bf1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1198bf2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f11e468be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f11e96d2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f11e949d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1197913897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1198becc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1198bf1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1198bf2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f11e468be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f11e96d2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f11e949d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1197913897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f1198876119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f11e468be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f11e96d2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f11e949d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41bdd24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f41beffdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f41bf002a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f41bf003dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f420aa9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f420fae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f420f8ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41bdd24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f41beffdc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f41bf002a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f41bf003dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f420aa9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f420fae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f420f8ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f41bdd24897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f41bec87119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f420aa9ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f420fae3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f420f8ae353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6520c36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6521f0fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6521f14a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6521f15dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f656d9aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f65729f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f65727c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6520c36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6521f0fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6521f14a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6521f15dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f656d9aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f65729f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f65727c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6520c36897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f6521b99119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f656d9aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f65729f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f65727c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa5defda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa5e02b3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa5e02b8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa5e02b9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fa62bd52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fa630d99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fa630b64353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa5defda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa5e02b3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa5e02b8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa5e02b9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fa62bd52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fa630d99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fa630b64353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa5defda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fa5dff3d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fa62bd52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fa630d99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fa630b64353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ea05a1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3ea187ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3ea187fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3ea1880dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f3eed319e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f3ef2360609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f3ef212b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ea05a1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3ea187ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3ea187fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3ea1880dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f3eed319e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f3ef2360609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f3ef212b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ea05a1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f3ea1504119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f3eed319e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f3ef2360609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f3ef212b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf81841897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdf82b1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdf82b1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdf82b20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fdfce5b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fdfd3600609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fdfd33cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16777216, NumelOut=16777216, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf81841897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdf82b1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdf82b1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdf82b20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fdfce5b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fdfd3600609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fdfd33cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf81841897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fdf827a4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fdfce5b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fdfd3600609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fdfd33cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa687f00897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6891d9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6891dea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6891dfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa6d4c78e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa6d9cbf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa6d9a8a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa687f00897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa6891d9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa6891dea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa6891dfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fa6d4c78e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fa6d9cbf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fa6d9a8a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa687f00897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fa688e63119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fa6d4c78e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fa6d9cbf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fa6d9a8a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbd1667897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efbd2940c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efbd2945a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efbd2946dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7efc1e3dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7efc23426609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7efc231f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=131072, NumelOut=131072, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbd1667897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efbd2940c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efbd2945a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efbd2946dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7efc1e3dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7efc23426609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7efc231f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efbd1667897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7efbd25ca119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7efc1e3dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7efc23426609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7efc231f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -W0703 10:06:54.539000 139738299811648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1214099 closing signal SIGTERM -W0703 10:06:54.540000 139738299811648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1214100 closing signal SIGTERM -W0703 10:06:54.540000 139738299811648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1214102 closing signal SIGTERM -W0703 10:06:54.540000 139738299811648 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1214103 closing signal SIGTERM -E0703 10:06:56.005000 139738299811648 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 1214101) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:06:54 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1214104) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1214104 -[2]: - time : 2024-07-03_10:06:54 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1214105) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1214105 -[3]: - time : 2024-07-03_10:06:54 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1214106) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1214106 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:06:54 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1214101) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1214101 -============================================================ -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -W0703 10:06:58.473000 140354974770944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1934408_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:58.810000 139993534904064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_845076_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:58.982000 139795940247296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3291694_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:59.153000 140339930031872 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1907031_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:59.245000 140334116206336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-132.ec2.internal_2478027_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:59.286000 139948346636032 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_977167_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:06:59.351000 140568726578944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1135792_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:59.545000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135865 closing signal SIGTERM -W0703 10:06:59.545000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135866 closing signal SIGTERM -W0703 10:06:59.545000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135867 closing signal SIGTERM -W0703 10:06:59.545000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135868 closing signal SIGTERM -W0703 10:06:59.546000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135869 closing signal SIGTERM -W0703 10:06:59.548000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135870 closing signal SIGTERM -W0703 10:06:59.549000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135871 closing signal SIGTERM -W0703 10:06:59.549000 140574387312448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1135872 closing signal SIGTERM -W0703 10:06:59.587000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291767 closing signal SIGTERM -W0703 10:06:59.588000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291768 closing signal SIGTERM -W0703 10:06:59.588000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291769 closing signal SIGTERM -W0703 10:06:59.588000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291770 closing signal SIGTERM -W0703 10:06:59.591000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291771 closing signal SIGTERM -W0703 10:06:59.591000 140339776939840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2478102 closing signal SIGTERM -W0703 10:06:59.592000 140339776939840 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2478104 closing signal SIGTERM -W0703 10:06:59.591000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291772 closing signal SIGTERM -W0703 10:06:59.591000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291773 closing signal SIGTERM -W0703 10:06:59.596000 139801600980800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3291774 closing signal SIGTERM -W0703 10:06:59.600000 139999195637568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 845152 closing signal SIGTERM -W0703 10:06:59.600000 139999195637568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 845154 closing signal SIGTERM -W0703 10:06:59.600000 139999195637568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 845155 closing signal SIGTERM -W0703 10:06:59.611000 139954007369536 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977240 closing signal SIGTERM -W0703 10:06:59.612000 139954007369536 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977241 closing signal SIGTERM -W0703 10:06:59.612000 139954007369536 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977242 closing signal SIGTERM -W0703 10:06:59.614000 139954007369536 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977243 closing signal SIGTERM -W0703 10:06:59.614000 139954007369536 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977244 closing signal SIGTERM -W0703 10:06:59.615000 139954007369536 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977245 closing signal SIGTERM -W0703 10:06:59.617000 139954007369536 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977246 closing signal SIGTERM -W0703 10:06:59.617000 139954007369536 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 977247 closing signal SIGTERM -W0703 10:06:59.654000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934480 closing signal SIGTERM -W0703 10:06:59.654000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934481 closing signal SIGTERM -W0703 10:06:59.654000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934482 closing signal SIGTERM -W0703 10:06:59.654000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934483 closing signal SIGTERM -W0703 10:06:59.654000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934484 closing signal SIGTERM -W0703 10:06:59.656000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934485 closing signal SIGTERM -W0703 10:06:59.656000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934486 closing signal SIGTERM -W0703 10:06:59.659000 140360635504448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1934487 closing signal SIGTERM -E0703 10:06:59.789000 140345590765376 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1907104) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:06:59.801000 140345590765376 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1907031_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:59.831000 140345590765376 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1907031_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:06:59.865000 140345590765376 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1907031_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 1907105) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907105 -[2]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 1907106) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907106 -[3]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 1907107) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907107 -[4]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 1907108) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907108 -[5]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 1907109) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907109 -[6]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 1907110) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907110 -[7]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 1907111) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907111 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 1907104) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1907104 -============================================================ -E0703 10:07:00.198000 140339776939840 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 2478101) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:07:00.211000 140339776939840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2478027_0' has failed to shutdown the rendezvous 'none' due to an error of type 
RendezvousConnectionError. -W0703 10:07:00.242000 140339776939840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2478027_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:00.264000 140339776939840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2478027_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-132.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 2478103) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2478103 -[2]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-132.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 2478105) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2478105 -[3]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-132.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 2478106) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2478106 -[4]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-132.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 2478107) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2478107 -[5]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-132.ec2.internal - rank : 39 (local_rank: 7) - exitcode : -6 (pid: 2478108) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2478108 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:06:59 - host : ip-26-0-169-132.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 2478101) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2478101 -============================================================ -E0703 10:07:00.485000 139999195637568 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 845149) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:07:00.498000 139999195637568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 
'ip-26-0-163-220.ec2.internal_845076_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-169-86: task 4: Exited with exit code 1 -W0703 10:07:00.527000 139999195637568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_845076_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:00.546000 139999195637568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_845076_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:06:59 - host : ip-26-0-163-220.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 845150) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 845150 -[2]: - time : 2024-07-03_10:06:59 - host : ip-26-0-163-220.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 845151) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 845151 -[3]: - time : 2024-07-03_10:06:59 - host : ip-26-0-163-220.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 845153) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 845153 -[4]: - time : 2024-07-03_10:06:59 - host : ip-26-0-163-220.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 845156) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 845156 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:06:59 - host : ip-26-0-163-220.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 845149) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 845149 -============================================================ -srun: error: ip-26-0-169-132: task 5: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 1: Exited with exit code 1 -W0703 10:07:03.478000 140354974770944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1934408_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type 
RendezvousConnectionError. -W0703 10:07:03.987000 139795940247296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3291694_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:04.291000 139948346636032 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_977167_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:04.355000 140568726578944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1135792_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:08.093000 139954007369536 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_977167_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:08.108000 139954007369536 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_977167_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - 
self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 10:07:08.482000 140354974770944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1934408_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 10:07:08.991000 139795940247296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3291694_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:09.359000 140568726578944 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1135792_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:11.708000 140574387312448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1135792_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:11.723000 140574387312448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1135792_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 10:07:12.322000 139801600980800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3291694_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:07:12.339000 139801600980800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3291694_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -W0703 10:07:13.491000 140360635504448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1934408_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:07:13.509000 140360635504448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1934408_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-168-238: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
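The log above ends with the ranks on ip-26-0-169-86, ip-26-0-169-132 and ip-26-0-163-220 aborting with SIGABRT (exitcode -6), after which the surviving torchrun agents can no longer reach the c10d rendezvous store and exit with RendezvousConnectionError / Broken pipe. None of those messages match the grep patterns used by the bench.slurm scripts in this folder, so unless an earlier part of the log contains one of the expected strings, a run like this one is recorded as "fail" rather than "oom" or "timeout". For reference, a condensed sketch of that classification step (illustrative only, not part of the deleted scripts; classify_run, LOG and STATUS are placeholder names for the per-run helper, log.out path and status.txt path):

# Condensed restatement of the status classification done at the end of each
# bench.slurm script above. LOG and STATUS are hypothetical placeholders.
classify_run() {
  local exit_status=$1 LOG=$2 STATUS=$3
  if [ "$exit_status" -eq 0 ]; then
    printf "completed" > "$STATUS"
  elif grep -q "OutOfMemoryError" "$LOG"; then
    printf "oom" > "$STATUS"
  elif grep -q " CUDA error: an illegal memory access" "$LOG"; then
    printf "oom" > "$STATUS"          # kept as "oom" to mirror the original scripts
  elif grep -q "Timeout at NCCL" "$LOG"; then
    printf "timeout" > "$STATUS"
  else
    printf "fail" > "$STATUS"         # e.g. the SIGABRT / rendezvous cascade above
  fi
}

Note that the original scripts also map "CUDA error: an illegal memory access" to "oom"; the sketch keeps that behaviour rather than introducing a separate label.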
diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-32/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/bench.slurm deleted file mode 100644 index 21c6056ace2f3d89bbe1112598f04d44a8202e45..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/config.yaml deleted file mode 100644 index c6c1e8f2fc33723dac26192228522819b4a9333d..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 128 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 4 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out deleted file mode 100644 index 1ef4359484449ae7ca9e7c40601a3d8a5a41d205..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/log.out +++ /dev/null @@ -1,5747 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:48:50 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:48:55.178000 139720356824896 torch/distributed/run.py:757] -W0703 09:48:55.178000 139720356824896 torch/distributed/run.py:757] ***************************************** -W0703 09:48:55.178000 139720356824896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:48:55.178000 139720356824896 torch/distributed/run.py:757] ***************************************** -W0703 09:48:55.612000 140149609514816 torch/distributed/run.py:757] -W0703 09:48:55.612000 140149609514816 torch/distributed/run.py:757] ***************************************** -W0703 09:48:55.612000 140149609514816 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:48:55.612000 140149609514816 torch/distributed/run.py:757] ***************************************** -W0703 09:48:55.720000 139756129400640 torch/distributed/run.py:757] -W0703 09:48:55.720000 139756129400640 torch/distributed/run.py:757] ***************************************** -W0703 09:48:55.720000 139756129400640 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:48:55.720000 139756129400640 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.039000 140054861379392 torch/distributed/run.py:757] -W0703 09:48:56.039000 140054861379392 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.039000 140054861379392 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:48:56.039000 140054861379392 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.066000 139723817133888 torch/distributed/run.py:757] -W0703 09:48:56.066000 139723817133888 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.066000 139723817133888 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:48:56.066000 139723817133888 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.130000 139781130622784 torch/distributed/run.py:757] -W0703 09:48:56.130000 139781130622784 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.130000 139781130622784 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:48:56.130000 139781130622784 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.147000 140403810649920 torch/distributed/run.py:757] -W0703 09:48:56.147000 140403810649920 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.147000 140403810649920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:48:56.147000 140403810649920 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.405000 140290787850048 torch/distributed/run.py:757] -W0703 09:48:56.405000 140290787850048 torch/distributed/run.py:757] ***************************************** -W0703 09:48:56.405000 140290787850048 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:48:56.405000 140290787850048 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:49:20 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=2, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:49:20 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=4, -[default0]:07/03/2024 09:49:20 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=128, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4')), -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 09:49:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default0]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=8|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=0|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=15|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=14|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=12|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=9|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-138]: No checkpoint path provided. -[default6]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=1|ip-26-0-165-24]: No checkpoint path provided. 
-[default3]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=15|ip-26-0-166-125]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=2|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=11|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=3|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=6|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=13|ip-26-0-166-125]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=10|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=5|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=4|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 09:49:38 [INFO|DP=1|PP=1|TP=7|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 09:49:38 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=0|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=8|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=8|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=8|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. 
-[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=6|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=1|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=2|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=5|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=4|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=7|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-147]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-147]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=3|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=12|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=12|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=12|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=11|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=11|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=11|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=10|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=10|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=10|ip-26-0-164-207]: No checkpoint path provided. -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=9|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=9|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=9|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=13|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=13|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=13|ip-26-0-164-207]: No checkpoint path provided. -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=15|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=15|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=15|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=14|ip-26-0-164-207]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=14|ip-26-0-164-207]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 09:49:38 [INFO|DP=0|PP=1|TP=14|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 09:49:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:49:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:49:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 09:49:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 09:49:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:49:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 09:49:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:41 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:49:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:49:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 09:49:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 09:49:42.779762 | mbs: 4 | grad_accum: 128 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:49:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:49:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default4]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=12|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=6|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:42 [WARNING|DP=1|PP=1|TP=1|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=6|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=4|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=9|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=15|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=11|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=10|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=8|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=14|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:49:42 [WARNING|DP=1|PP=1|TP=12|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:43 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=6|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:42 [WARNING|DP=1|PP=1|TP=0|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:42 [WARNING|DP=1|PP=1|TP=13|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:42 [WARNING|DP=1|PP=1|TP=10|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:43 [WARNING|DP=0|PP=1|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:49:42 [WARNING|DP=1|PP=0|TP=4|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:49:42 [WARNING|DP=1|PP=1|TP=7|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:49:42 [WARNING|DP=1|PP=1|TP=4|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=2|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:49:43 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=5|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:49:42 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=3|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=12|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=11|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. 
Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=8|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=9|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:49:43 [WARNING|DP=0|PP=1|TP=15|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:49:42 [WARNING|DP=0|PP=1|TP=14|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:43 [WARNING|DP=1|PP=0|TP=13|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=14|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:49:43 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:43 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=9|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=11|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=2|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:43 [WARNING|DP=1|PP=0|TP=5|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=5|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 09:49:43 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:43 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:49:43 [WARNING|DP=0|PP=1|TP=13|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=3|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=8|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:49:43 [WARNING|DP=0|PP=1|TP=1|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:49:43 [WARNING|DP=1|PP=1|TP=15|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:43 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:49:43 [WARNING|DP=0|PP=1|TP=7|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:49:43 [WARNING|DP=0|PP=1|TP=10|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
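(Editor's note, not part of the captured log: the watchdog entries above show pipeline-parallel SEND collectives hitting the 10-minute limit, Timeout(ms)=600000, before the per-rank tracebacks below. As a minimal, hypothetical sketch of how that limit can be raised in a plain torch.distributed setup, independent of nanotron's own initialization path and assuming MASTER_ADDR/MASTER_PORT and rank environment variables are already exported by the launcher:

    from datetime import timedelta
    import torch.distributed as dist

    # Illustrative only: pass a longer collective timeout so the NCCL watchdog
    # does not abort long-running pipeline-parallel send/recv operations.
    # The log's 600000 ms corresponds to the default timedelta(minutes=10).
    dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))

Raising the timeout only hides the symptom if a rank is genuinely stuck; the tracebacks that follow show where the receiving ranks were blocked when the communicator was aborted.)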
-[default0]:[rank8]: Traceback (most recent call last):
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank8]:     trainer.train(dataloader)
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank8]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank8]:     outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank8]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank8]:     grad_accumulator.backward(sum(activations))
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank8]:     result = loss.backward()
-[default0]:[rank8]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default0]:[rank8]:     torch.autograd.backward(
-[default0]:[rank8]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank8]:     _engine_run_backward(
-[default0]:[rank8]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank8]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-[default0]:[rank8]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank8]:     return user_fn(self, *args)
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank8]:     pipeline_state.run_communication()
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default0]:[rank8]:     self.grads_buffer.append(recv_grad())
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default0]:[rank8]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default0]:[rank8]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank8]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank8]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank8]:     dist.recv(
-[default0]:[rank8]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank8]:     return func(*args, **kwargs)
-[default0]:[rank8]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default0]:[rank8]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank0]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank6]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank14]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank15]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank43]: Traceback (most recent call last):
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank43]:     trainer.train(dataloader)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank43]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank43]:     outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank43]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank43]:     output = model(**micro_batch)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank43]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank43]:     return forward_call(*args, **kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank43]:     sharded_logits = self.model(
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank43]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank43]:     return forward_call(*args, **kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank43]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank43]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank43]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank43]:     return forward_call(*args, **kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default3]:[rank43]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default3]:[rank43]:     pipeline_state.run_communication()
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default3]:[rank43]:     recv_activation_tensor = recv_activation()
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default3]:[rank43]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank43]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank43]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank43]:     dist.recv(
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank43]:     return func(*args, **kwargs)
-[default3]:[rank43]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank43]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank34]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank1]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank1]: grad_accumulator.backward(sum(activations)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank1]: result = loss.backward() -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank1]: torch.autograd.backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank1]: _engine_run_backward( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank1]: return user_fn(self, *args) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank1]: pipeline_state.run_communication() -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank1]: self.grads_buffer.append(recv_grad()) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank1]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank1]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank1]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default1]:[rank1]: dist.recv( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank1]: return func(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank1]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank1]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
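The tracebacks above repeat essentially the same stack for many ranks (only the "[defaultN]:[rankM]:" prefix and a couple of line numbers differ), each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted". Below is a minimal sketch, not part of nanotron or this benchmark, for collapsing that per-rank repetition into the distinct failing lines; the "log.out" path and the group_tracebacks helper are illustrative assumptions, and the regex relies only on the prefix format visible in these lines.

import re
from collections import defaultdict

# Matches the "[defaultN]:[rankM]:" prefix seen on the traceback lines above.
# The [rankM] part is optional because some lines (e.g. the C++ watchdog frames)
# carry only the "[defaultN]:" prefix.
PREFIX_RE = re.compile(r"^\[default\d+\]:(?:\[rank(?P<rank>\d+)\]:)?\s*")

def group_tracebacks(log_path):
    """Map each distinct log line (prefix stripped) to the set of ranks that emitted it."""
    by_line = defaultdict(set)
    with open(log_path, errors="replace") as f:
        for raw in f:
            m = PREFIX_RE.match(raw)
            if m is None or m.group("rank") is None:
                continue  # skip lines without a [rankM] prefix
            body = raw[m.end():].rstrip()
            if body:
                by_line[body].add(int(m.group("rank")))
    return by_line

if __name__ == "__main__":
    # Print only the final exception lines, with the ranks that hit them.
    for body, ranks in group_tracebacks("log.out").items():
        if body.startswith("torch.distributed.DistBackendError"):
            print(sorted(ranks), body)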
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f471c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f4849cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f484a1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f484a2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5f93f3be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5f98f82609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5f98d4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:[rank42]: Traceback (most recent call last): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f471c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f4849cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: trainer.train(dataloader) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f484a1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f484a2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5f93f3be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:frame #5: + 0x8609 (0x7f5f98f82609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5f98d4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f471c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5f48126119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5f93f3be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:frame #3: + 0x8609 (0x7f5f98f82609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:frame #4: clone + 0x43 (0x7f5f98d4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default0]: -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: 
new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank42]: dist.recv( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank42]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
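The ProcessGroupNCCL watchdog messages above share a fixed "WorkNCCL(SeqNum=..., OpType=..., NumelIn=..., NumelOut=..., Timeout(ms)=...)" format, here reporting SEND operations (SeqNum=15, NumelIn=2097152 or 16384) that ran past the 600000 ms timeout before the process group was torn down. The following is a minimal sketch for tabulating which ranks timed out on which operation; the "log.out" path and the summarize_nccl_timeouts helper are illustrative assumptions, and the regex relies only on the message format shown in these lines.

import re
from collections import Counter

# Matches watchdog timeout messages of the form seen above, e.g.
# "[rank40]:[E ProcessGroupNCCL.cpp:1414] ... WorkNCCL(SeqNum=15, OpType=SEND,
#  NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600017 milliseconds ..."
WORKNCCL_RE = re.compile(
    r"\[rank(?P<rank>\d+)\].*?"
    r"WorkNCCL\(SeqNum=(?P<seq>\d+), OpType=(?P<op>\w+), "
    r"NumelIn=(?P<numel_in>\d+), NumelOut=(?P<numel_out>\d+), "
    r"Timeout\(ms\)=(?P<timeout_ms>\d+)\) ran for (?P<ran_ms>\d+) milliseconds"
)

def summarize_nccl_timeouts(log_path):
    """Count (rank, OpType, SeqNum) triples reported by the NCCL watchdog."""
    hits = Counter()
    with open(log_path, errors="replace") as f:
        for line in f:
            m = WORKNCCL_RE.search(line)
            if m:
                hits[(int(m.group("rank")), m.group("op"), int(m.group("seq")))] += 1
    return hits

if __name__ == "__main__":
    # Note: the same timeout is usually reported twice per rank
    # (once as the [E ...] log line, once in the terminate() message).
    for (rank, op, seq), n in sorted(summarize_nccl_timeouts("log.out").items()):
        print(f"rank {rank}: {op} SeqNum={seq} timed out ({n} watchdog message(s))")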
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4399db6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f439b08fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f439b094a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f439b095dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f43e6b2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f43ebb75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f43eb940353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4399db6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f439b08fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f439b094a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f439b095dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f43e6b2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f43ebb75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f43eb940353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4399db6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f439ad19119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f43e6b2ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f43ebb75609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f43eb940353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank35]: pipeline_state.run_communication() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank35]: recv_activation_tensor = recv_activation() -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank35]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank35]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank35]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank35]: dist.recv( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank35]: return func(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank35]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank35]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5da7146897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5da841fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5da8424a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5da8425dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f5df3ebee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f5df8f05609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5df8cd0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5da7146897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5da841fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5da8424a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5da8425dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f5df3ebee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f5df8f05609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f5df8cd0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5da7146897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f5da80a9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f5df3ebee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f5df8f05609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f5df8cd0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank9]: Traceback (most recent call last): -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank9]: trainer.train(dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank9]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank9]: grad_accumulator.backward(sum(activations)) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank9]: result = loss.backward() -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank9]: torch.autograd.backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank9]: _engine_run_backward( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank9]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank9]: return user_fn(self, *args) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank9]: pipeline_state.run_communication() -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank9]: self.grads_buffer.append(recv_grad()) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank9]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank9]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank9]: meta = 
self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank9]: dist.recv( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank9]: return func(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank9]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb6e202897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffb6f4dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffb6f4e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffb6f4e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ffbbaf7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ffbbffc1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ffbbfd8c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb6e202897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffb6f4dbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffb6f4e0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffb6f4e1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ffbbaf7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ffbbffc1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ffbbfd8c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb6e202897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7ffb6f165119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7ffbbaf7ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7ffbbffc1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7ffbbfd8c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6066407897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f60676e0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60676e5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60676e6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f60b317fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f60b81c6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f60b7f91353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6066407897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f60676e0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60676e5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60676e6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f60b317fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f60b81c6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f60b7f91353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6066407897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f606736a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f60b317fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f60b81c6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f60b7f91353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank13]: trainer.train(dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank13]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank13]: grad_accumulator.backward(sum(activations)) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank13]: result = loss.backward() -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default5]:[rank13]: torch.autograd.backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank13]: _engine_run_backward( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank13]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank13]: return user_fn(self, *args) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank13]: pipeline_state.run_communication() -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank13]: self.grads_buffer.append(recv_grad()) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank13]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank13]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank13]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank13]: dist.recv( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank13]: return func(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank13]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank13]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fba5faa4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fba60d7dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fba60d82a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fba60d83dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4:[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. - + 0xd3e95 (0x7fbaac81ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbab1863609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbab162e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fba5faa4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fba60d7dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fba60d82a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fba60d83dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fbaac81ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fbab1863609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fbab162e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fba5faa4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fba60a07119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fbaac81ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fbab1863609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fbab162e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9a25d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa9a38afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa9a38b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa9a38b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa9ef34ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa9f4395609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa9f4160353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9a25d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa9a38afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa9a38b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa9a38b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa9ef34ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa9f4395609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa9f4160353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9a25d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fa9a3539119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fa9ef34ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fa9f4395609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fa9f4160353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank12]: self.grads_buffer.append(recv_grad()) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank12]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank12]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default4]:[rank12]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank12]: dist.recv( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank12]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fda7bcbf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fda7cf98c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fda7cf9da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fda7cf9edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fdac8a37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fdacda7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fdacd849353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fda7bcbf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fda7cf98c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fda7cf9da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fda7cf9edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fdac8a37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fdacda7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fdacd849353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fda7bcbf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fda7cc22119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fdac8a37e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fdacda7e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fdacd849353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5ba86f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd5bbb48c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd5bbb4da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd5bbb4edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd6075e7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd60c62e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd60c3f9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5ba86f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd5bbb48c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd5bbb4da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd5bbb4edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd6075e7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd60c62e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd60c3f9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd5ba86f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fd5bb7d2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd6075e7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7fd60c62e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd60c3f9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7071fb4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f707328dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7073292a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7073293dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f70bed2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f70c3d73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f70c3b3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7071fb4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f707328dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7073292a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7073293dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f70bed2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f70c3d73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f70c3b3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7071fb4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f7072f17119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f70bed2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f70c3d73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f70c3b3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06f1298897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f06f2571c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f06f2576a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f06f2577dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f073e010e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f0743057609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f0742e22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600058 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06f1298897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f06f2571c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f06f2576a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f06f2577dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f073e010e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f0743057609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f0742e22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06f1298897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f06f21fb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f073e010e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f0743057609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f0742e22353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fccf0ae5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fccf1dbec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fccf1dc3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fccf1dc4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default6]:frame #4: + 0xd3e95 (0x7fcd3d85de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac141b1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. 
Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fac1548ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #5: + 0x8609 (0x7fcd428a4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #6: clone + 0x43 (0x7fcd4266f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef7976c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fac1548fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fac15490dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef7aa45c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fac60f29e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #5: + 0x8609 (0x7fac65f70609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fac65d3b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef7aa4aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef7aa4bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]: -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fccf0ae5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #4: + 0xd3e95 (0x7fefc64e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fccf1dbec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #5: + 0x8609 (0x7fefcb52b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fefcb2f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fccf1dc3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600064 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac141b1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fccf1dc4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]:frame #4: + 0xd3e95 (0x7fcd3d85de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fcd428a4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fac1548ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fac1548fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef7976c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #6: clone + 0x43 (0x7fcd4266f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fac15490dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fac60f29e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fef7aa45c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default2]:frame #5: + 0x8609 (0x7fac65f70609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fef7aa4aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #6: clone + 0x43 (0x7fac65d3b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fef7aa4bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fccf0ae5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #4: + 0xd3e95 (0x7fefc64e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fefcb52b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]: -[default6]:frame #1: + 0xe32119 (0x7fccf1a48119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #2: + 0xd3e95 (0x7fcd3d85de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #6: clone + 0x43 (0x7fefcb2f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac141b1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fac15114119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: + 0x8609 (0x7fcd428a4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]: -[default6]:frame #4: clone + 0x43 (0x7fcd4266f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:frame #2: + 0xd3e95 (0x7fac60f29e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]: -[default2]:frame #3: + 0x8609 (0x7fac65f70609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fef7976c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fef7a6cf119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: clone + 0x43 (0x7fac65d3b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:frame #2: + 0xd3e95 (0x7fefc64e4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fefcb52b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fefcb2f6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank5]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default5]:[rank5]: dist.recv( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf51cdb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdf52fb4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdf52fb9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdf52fbadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fdf9ea53e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fdfa3a9a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fdfa3865353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf51cdb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdf52fb4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdf52fb9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdf52fbadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fdf9ea53e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fdfa3a9a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fdfa3865353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdf51cdb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fdf52c3e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fdf9ea53e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fdfa3a9a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fdfa3865353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f707b3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f71a8cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f71a91a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f71a92dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3fbd52be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3fc2572609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3fc233d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f707b3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3f71a8cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3f71a91a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3f71a92dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3fbd52be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3fc2572609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3fc233d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3f707b3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3f71716119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f3fbd52be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f3fc2572609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f3fc233d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfde81d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdfdfaf6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdfdfafba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdfdfafcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe02b595e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe0305dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe0303a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfde81d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdfdfaf6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdfdfafba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdfdfafcdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe02b595e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe0305dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe0303a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdfde81d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fdfdf780119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fe02b595e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fe0305dc609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fe0303a7353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdccba33897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdcccd0cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdcccd11a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdcccd12dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fdd187abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fdd1d7f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fdd1d5bd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdccba33897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdcccd0cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdcccd11a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdcccd12dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fdd187abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fdd1d7f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fdd1d5bd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdccba33897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fdccc996119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fdd187abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fdd1d7f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fdd1d5bd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9335e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb9348bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb9348c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb9348c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fb98035ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fb9853a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fb985170353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9335e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb9348bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb9348c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb9348c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fb98035ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fb9853a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fb985170353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9335e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fb934549119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fb98035ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fb9853a5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fb985170353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fb2265897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4fb353ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4fb3543a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4fb3544dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4ffefdde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5004024609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5003def353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fb2265897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4fb353ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4fb3543a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4fb3544dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4ffefdde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f5004024609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f5003def353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fb2265897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f4fb31c8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f4ffefdde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f5004024609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f5003def353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3869155897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f386a42ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f386a433a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f386a434dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f38b5ecde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f38baf14609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f38bacdf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3869155897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f386a42ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f386a433a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f386a434dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f38b5ecde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f38baf14609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f38bacdf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3869155897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f386a0b8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f38b5ecde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f38baf14609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f38bacdf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5fe1880897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5fe2b59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5fe2b5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5fe2b5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f602e5f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f603363f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f603340a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5fe1880897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5fe2b59c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5fe2b5ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5fe2b5fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f602e5f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f603363f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f603340a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5fe1880897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f5fe27e3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f602e5f8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f603363f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f603340a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f550fe7d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5511156c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f551115ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f551115cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f555cbf5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f5561c3c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f5561a07353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f550fe7d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5511156c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f551115ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f551115cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f555cbf5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f5561c3c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f5561a07353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f550fe7d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f5510de0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f555cbf5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f5561c3c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f5561a07353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: recv_activation_tensor = recv_activation() -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bcbf20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9bcd1f9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9bcd1fea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9bcd1ffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank33]: dist.recv( -[default5]:frame #4: + 0xd3e95 (0x7f9c18c98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9c1dcdf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9c1daaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]: -[default1]:[rank33]: return func(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank33]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bcbf20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9bcd1f9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9bcd1fea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9bcd1ffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9c18c98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f9c1dcdf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9c1daaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9bcbf20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f9bcce83119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f9c18c98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f9c1dcdf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f9c1daaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank58]: pipeline_state.run_communication() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank58]: recv_activation_tensor = recv_activation() -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank58]: dist.recv( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default2]:[rank58]: return func(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe703a67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe704d40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe704d45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe704d46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe7507dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fe755826609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fe7555f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe703a67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe704d40c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe704d45a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe704d46dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe7507dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fe755826609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fe7555f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe703a67897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fe7049ca119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fe7507dfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fe755826609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fe7555f1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8581f20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f85831f9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f85831fea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f85831ffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f85cec98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f85d3cdf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f85d3aaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8581f20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f85831f9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f85831fea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f85831ffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f85cec98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f85d3cdf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f85d3aaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8581f20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f8582e83119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f85cec98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f85d3cdf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f85d3aaa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank63]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank63]: pipeline_state.run_communication() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank63]: recv_activation_tensor = recv_activation() -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank63]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank63]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank63]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank63]: dist.recv( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank63]: return func(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank63]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank63]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank30]: Traceback (most recent call last):
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank30]: trainer.train(dataloader)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default6]:[rank30]: grad_accumulator.backward(sum(activations))
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default6]:[rank30]: result = loss.backward()
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default6]:[rank30]: torch.autograd.backward(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default6]:[rank30]: _engine_run_backward(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default6]:[rank30]: return user_fn(self, *args)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default6]:[rank30]: pipeline_state.run_communication()
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default6]:[rank30]: self.grads_buffer.append(recv_grad())
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default6]:[rank30]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank30]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank30]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default6]:[rank30]: dist.recv(
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank30]: return func(*args, **kwargs)
-[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default5]:[rank29]: Traceback (most recent call last):
-[default5]:[rank29]: ... (same backward-pass recv_grad() call stack as rank 30, blocked in p2p._recv_meta -> dist.recv) ...
-[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default3]:[rank27]: Traceback (most recent call last):
-[default3]:[rank27]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
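The backward-pass variant of the hang is worth reading closely: loss.backward() re-enters Python from the C++ autograd engine (graph.py -> function.py -> user_fn), and it is the custom autograd Function in nanotron's functional.py whose backward() triggers the gradient receive (recv_grad). Below is a hedged sketch of that general pattern, not nanotron's actual implementation; the class name and message flow are invented for the example:

import torch
import torch.distributed as dist


class SendActivationRecvGrad(torch.autograd.Function):
    """Forward hands the activation to the next stage; backward waits for the gradient to come back."""

    @staticmethod
    def forward(ctx, activation: torch.Tensor, peer: int) -> torch.Tensor:
        ctx.peer = peer
        dist.send(activation.detach(), dst=peer)
        return activation

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        grad = torch.empty_like(grad_output)
        # This is the shape of the hang in the tracebacks: the autograd engine calls back
        # into Python and the backward blocks in a point-to-point receive.
        dist.recv(grad, src=ctx.peer)
        return grad, None  # one gradient per forward input (the int peer gets None)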
-[default0]:[rank24]: Traceback (most recent call last):
-[default0]:[rank24]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600057 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f891c6c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f891d99cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f891d9a1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f891d9a2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7f896943be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7f896e482609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7f896e24d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600057 milliseconds before timing out.
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): ... (same frames #0-#6 as above) ...
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f891c6c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7f891d626119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: + 0xd3e95 (0x7f896943be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #3: + 0x8609 (0x7f896e482609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #4: clone + 0x43 (0x7f896e24d353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. (preceding messages and stack frames identical to rank 12)
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. (preceding messages and stack frames identical to rank 12)
-[default4]:[rank28]: Traceback (most recent call last):
-[default4]:[rank28]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
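The watchdog blocks above show the other half of the failure: ranks whose SEND never completes are killed by ProcessGroupNCCL's watchdog after the configured Timeout(ms)=600000, i.e. 10 minutes, and the resulting abort then propagates to every rank blocked in a matching receive. That per-operation timeout is fixed when the process group is created. A hedged sketch of how it is usually configured (assuming a torchrun-style launch that provides MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE; note the async-error-handling variable is named differently across PyTorch releases and may already be on by default):

import os
from datetime import timedelta

import torch.distributed as dist

# Fail fast instead of hanging forever; older releases use NCCL_ASYNC_ERROR_HANDLING.
os.environ.setdefault("TORCH_NCCL_ASYNC_ERROR_HANDLING", "1")

# env:// init reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE injected by the launcher.
dist.init_process_group(
    backend="nccl",
    timeout=timedelta(minutes=30),  # the log above was running with 10 minutes, i.e. Timeout(ms)=600000
)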
-[default1]:[rank25]: Traceback (most recent call last):
-[default1]:[rank25]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank23]: Traceback (most recent call last):
-[default7]:[rank23]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default7]:[rank23]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default7]:[rank31]: Traceback (most recent call last):
-[default7]:[rank31]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default7]:[rank31]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default2]:[rank18]: Traceback (most recent call last):
-[default2]:[rank18]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default2]:[rank18]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. (stack frames identical to rank 12)
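The tracebacks also reveal the two-phase shape of nanotron's point-to-point exchange: irecv_tensors first receives a small metadata message (_recv_meta) describing what is coming, then the payload, so a producer that dies between the two leaves every consumer stuck on the metadata receive, in both the forward (recv_activation) and backward (recv_grad) directions. Below is a loose, illustrative reimplementation of such a metadata-then-payload exchange; it is not nanotron's p2p.py, the message layout is invented, and the gloo backend is assumed so CPU tensors can be sent:

import torch
import torch.distributed as dist


def send_tensor(t: torch.Tensor, dst: int) -> None:
    # Metadata first (number of dimensions and the shape), then the payload itself.
    meta = torch.tensor([t.dim(), *t.shape], dtype=torch.long)
    dist.send(torch.tensor([meta.numel()], dtype=torch.long), dst=dst)
    dist.send(meta, dst=dst)
    dist.send(t.contiguous(), dst=dst)


def recv_tensor(src: int, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # The consumer blocks on the metadata receive, just like _recv_meta in the log:
    # if the producer never sends, this is where the rank sits until it is aborted.
    n = torch.zeros(1, dtype=torch.long)
    dist.recv(n, src=src)
    meta = torch.zeros(int(n.item()), dtype=torch.long)
    dist.recv(meta, src=src)
    shape = tuple(meta[1:].tolist())
    payload = torch.empty(shape, dtype=dtype)
    dist.recv(payload, src=src)
    return payload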
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]: ... (forward pass blocked waiting for activations: block.py forward -> recv_from_pipeline_state_buffer -> pipeline_state.run_communication -> recv_activation -> p2p.recv_tensors -> _recv_meta -> dist.recv, same shape as the rank 60 stack above) ...
-[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank22]: Traceback (most recent call last):
-[default6]:[rank22]: ... (same backward-pass recv_grad() call stack as rank 30) ...
-[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default0]:[rank48]: Traceback (most recent call last):
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank48]: trainer.train(dataloader)
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank48]: output = model(**micro_batch)
-[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank48]: return self._call_impl(*args, **kwargs)
-[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank48]: return forward_call(*args, **kwargs)
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank48]: sharded_logits = self.model(
-[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank48]: return self._call_impl(*args, **kwargs)
-[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank48]: return forward_call(*args, **kwargs)
-[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank48]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: return func(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank48]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank54]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank54]: pipeline_state.run_communication() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank54]: recv_activation_tensor = recv_activation() -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank54]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank54]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank54]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]: dist.recv( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank54]: return func(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank54]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faf8aed4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faf8c1adc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faf8c1b2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faf8c1b3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fafd7c4ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fafdcc93609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fafdca5e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faf8aed4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faf8c1adc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faf8c1b2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faf8c1b3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fafd7c4ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fafdcc93609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fafdca5e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faf8aed4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7faf8be37119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fafd7c4ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7fafdcc93609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fafdca5e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank17]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank17]: grad_accumulator.backward(sum(activations)) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank17]: result = loss.backward() -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank17]: torch.autograd.backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank17]: _engine_run_backward( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank17]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank17]: return user_fn(self, *args) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank17]: pipeline_state.run_communication() -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank17]: self.grads_buffer.append(recv_grad()) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank17]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank17]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default1]:[rank17]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank17]: dist.recv( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank17]: return func(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank17]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank17]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank61]: dist.recv( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd65822b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd659504c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd659509a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd65950adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd6a4fa3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd6a9fea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd6a9db5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd65822b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd659504c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd659509a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd65950adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd6a4fa3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd6a9fea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd6a9db5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd65822b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fd65918e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fd6a4fa3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fd6a9fea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fd6a9db5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank52]: pipeline_state.run_communication() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank52]: dist.recv( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: return func(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank52]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd44f92b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd450c04c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd450c09a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd450c0adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd49c6a3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd4a16ea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd4a14b5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f639882f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6399b08c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6399b0da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6399b0edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f63e55a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f63ea5ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f63ea3b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f639882f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6399b08c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f6399b0da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f6399b0edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f63e55a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f63ea5ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f63ea3b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f639882f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f6399792119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f63e55a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f63ea5ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f63ea3b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd44f92b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd450c04c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd450c09a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd450c0adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fd49c6a3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fd4a16ea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fd4a14b5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd44f92b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fd45088e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fd49c6a3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fd4a16ea609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fd4a14b5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, 
[... omitted: a forward-path traceback from rank 53 and a backward-path traceback from rank 21 identical to rank 22's above, [E ProcessGroupNCCL.cpp:1537/577/583/1414] reports from ranks 58, 60 and 61 for the same WorkNCCL SEND timeout, and the interleaved C++ watchdog stack frames from their watchdog threads ([default2], [default4], [default5]) ...]
-[default3]:[rank59]: Traceback (most recent call last):
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default3]:[rank59]: trainer.train(dataloader)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank59]: output = model(**micro_batch)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank59]: return self._call_impl(*args, **kwargs)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank59]: return forward_call(*args, **kwargs)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank59]: sharded_logits = self.model(
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank59]: return self._call_impl(*args, **kwargs)
-[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank59]: return forward_call(*args, **kwargs)
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank59]: File
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f52799c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:frame #6: clone + 0x43 (0x7f2cd2ffc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f682d43ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f682d43fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f527aca1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f6878ed8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f687df1f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f527aca6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f527aca7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default5]:frame #4: + 0xd3e95 (0x7f52c6740e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f52cb787609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default5]:frame #6: clone + 0x43 (0x7f52cb552353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2c81472897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #6: clone + 0x43 (0x7f687dcea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: trainer.train(dataloader) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2c8274bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default4]: -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f52799c8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2c82750a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2c82751dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]:[rank59]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:frame #4: + 0xd3e95 (0x7f2cce1eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: pipeline_state.run_communication() -[default2]:frame #5: + 0x8609 (0x7f2cd3231609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f2cd2ffc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:frame #1: + 0xe32119 (0x7f527a92b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f682c160897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank59]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:frame #2: + 0xd3e95 (0x7f52c6740e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f52cb787609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f682d439c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f682d43ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default3]:[rank59]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank59]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f682d43fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): 
-[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default4]:frame #4: + 0xd3e95 (0x7f6878ed8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:frame #4: clone + 0x43 (0x7f52cb552353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2c81472897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:frame #5: + 0x8609 (0x7f687df1f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f687dcea353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:frame #1: + 0xe32119 (0x7f2c823d5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f682c160897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #2: + 0xd3e95 (0x7f2cce1eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f2cd3231609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #1: + 0xe32119 (0x7f682d0c3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:frame #2: + 0xd3e95 (0x7f6878ed8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #4: clone + 0x43 (0x7f2cd2ffc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:frame #3: + 0x8609 (0x7f687df1f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f687dcea353 in /lib/x86_64-linux-gnu/libc.so.6) 
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: dist.recv( -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default4]: -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank59]: return func(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank57]: pipeline_state.run_communication() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank57]: recv_activation_tensor = recv_activation() -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank57]: dist.recv( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank57]: return func(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank19]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank19]: grad_accumulator.backward(sum(activations)) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank19]: result = loss.backward() -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank19]: torch.autograd.backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank19]: _engine_run_backward( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank19]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank19]: return user_fn(self, *args) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank19]: pipeline_state.run_communication() -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank19]: self.grads_buffer.append(recv_grad()) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank19]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank19]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank19]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default3]:[rank19]: dist.recv( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) 
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0919a01897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f091acdac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f091acdfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f091ace0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f0966779e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f096b7c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f096b58b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0919a01897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f091acdac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f091acdfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f091ace0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f0966779e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f096b7c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f096b58b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0919a01897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f091a964119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f0966779e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f096b7c0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f096b58b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f873db66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f873ee3fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f873ee44a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f873ee45dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f878a8dee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f878f925609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f878f6f0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f873db66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f873ee3fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f873ee44a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f873ee45dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f878a8dee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f878f925609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f878f6f0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f873db66897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f873eac9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f878a8dee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f878f925609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f878f6f0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank55]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:[rank51]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3089cce897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f308afa7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f308afaca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f308afaddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default6]:frame #4: + 0xd3e95 (0x7f30d6a46e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f30dba8d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f30db858353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default7]:[rank55]: sharded_logits = self.model(
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3089cce897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f308afa7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f308afaca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f308afaddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:frame #4: + 0xd3e95 (0x7f30d6a46e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank55]: return self._call_impl(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank55]: return forward_call(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:frame #5: + 0x8609 (0x7f30dba8d609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #6: clone + 0x43 (0x7f30db858353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default7]:[rank55]: pipeline_state.run_communication()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default7]:[rank55]: recv_activation_tensor = recv_activation()
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default7]:[rank55]: dist.recv(
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default7]:[rank55]: return func(*args, **kwargs)
-[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3089cce897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:frame #1: + 0xe32119 (0x7f308ac31119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:frame #2: + 0xd3e95 (0x7f30d6a46e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default6]:frame #3: + 0x8609 (0x7f30dba8d609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default6]:frame #4: clone + 0x43 (0x7f30db858353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default6]:
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f771c30e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f771d5e7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f771d5eca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f771d5eddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f7769086e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f776e0cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f776de98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600078 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f771c30e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f771d5e7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f771d5eca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f771d5eddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f7769086e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f776e0cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f776de98353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f771c30e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f771d271119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f7769086e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f776e0cd609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7f776de98353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:[rank16]: Traceback (most recent call last):
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank16]: trainer.train(dataloader)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default0]:[rank16]: grad_accumulator.backward(sum(activations))
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank16]: result = loss.backward()
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default4]:[rank20]: Traceback (most recent call last):
-[default0]:[rank16]: torch.autograd.backward(
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default0]:[rank16]: _engine_run_backward(
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default4]:[rank20]: trainer.train(dataloader)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter(
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default0]:[rank16]: return user_fn(self, *args)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default0]:[rank16]: pipeline_state.run_communication()
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default0]:[rank16]: self.grads_buffer.append(recv_grad())
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default0]:[rank16]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank20]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default4]:[rank20]: grad_accumulator.backward(sum(activations))
-[default0]:[rank16]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default0]:[rank16]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default0]:[rank16]: dist.recv(
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank16]: return func(*args, **kwargs)
-[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default0]:[rank16]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank20]: result = loss.backward()
-[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default4]:[rank20]: torch.autograd.backward(
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default4]:[rank20]: _engine_run_backward(
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default4]:[rank20]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default4]:[rank20]: return user_fn(self, *args)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default4]:[rank20]: pipeline_state.run_communication()
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default4]:[rank20]: self.grads_buffer.append(recv_grad())
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default4]:[rank20]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default4]:[rank20]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default4]:[rank20]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default4]:[rank20]: dist.recv(
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default4]:[rank20]: return func(*args, **kwargs)
-[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default4]:[rank20]: pg.recv([tensor], group_src_rank, tag).wait()
-[default4]:[rank20]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd37f0c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd38039bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd3803a0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd3803a1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd3cbe3ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd3d0e81609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd3d0c4c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd37f0c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd38039bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd3803a0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd3803a1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fd3cbe3ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fd3d0e81609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fd3d0c4c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd37f0c2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fd380025119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fd3cbe3ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fd3d0e81609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fd3d0c4c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff4d7128897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff4d8401c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff4d8406a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff4d8407dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff523ea0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff528ee7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff528cb2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff4d7128897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff4d8401c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff4d8406a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff4d8407dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7ff523ea0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7ff528ee7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7ff528cb2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff4d7128897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7ff4d808b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7ff523ea0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7ff528ee7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7ff528cb2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02416df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02429b8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02429bda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02429bedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f028e457e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f029349e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f0293269353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02416df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f02429b8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f02429bda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f02429bedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f028e457e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f029349e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f0293269353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f02416df897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f0242642119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f028e457e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f029349e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f0293269353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1604d2e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1606007c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f160600ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f160600ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f1651aa6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f1656aed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f16568b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1604d2e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1606007c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f160600ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f160600ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f1651aa6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f1656aed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f16568b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1604d2e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f1605c91119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f1651aa6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f1656aed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f16568b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3abd72897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb3ad04bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb3ad050a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb3ad051dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb3f8aeae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb3fdb31609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb3fd8fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3abd72897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb3ad04bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb3ad050a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb3ad051dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb3f8aeae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb3fdb31609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb3fd8fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb3abd72897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fb3accd5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fb3f8aeae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fb3fdb31609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fb3fd8fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f767f3c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f768069ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f768069fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76806a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f76cc139e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f76d1180609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f76d0f4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f767f3c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f768069ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f768069fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76806a0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f76cc139e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f76d1180609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f76d0f4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f767f3c1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f7680324119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f76cc139e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f76d1180609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f76d0f4b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7d12aae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7d13d87c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7d13d8ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7d13d8ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7d5f826e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7d6486d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7d64638353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7d12aae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7d13d87c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7d13d8ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7d13d8ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f7d5f826e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f7d6486d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f7d64638353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7d12aae897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f7d13a11119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f7d5f826e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f7d6486d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f7d64638353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f714a17b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f714b454c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f714b459a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f714b45adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7196ef3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f719bf3a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f719bd05353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f714a17b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f714b454c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f714b459a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f714b45adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7196ef3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f719bf3a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f719bd05353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f714a17b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f714b0de119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f7196ef3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f719bf3a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f719bd05353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4c692c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa4c7c05c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa4c7c0aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7283e45897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa4c7c0bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fa5136a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f728511ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #5: + 0x8609 (0x7fa5186eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7285123a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #6: clone + 0x43 (0x7fa5184b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7285124dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: -[default5]:frame #4: + 0xd3e95 (0x7f72d0bbde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f72d5c04609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f72d59cf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600087 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4c692c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa4c7c05c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7283e45897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa4c7c0aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f728511ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7285123a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7285124dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa4c7c0bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fa5136a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fa5186eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: + 0xd3e95 (0x7f72d0bbde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #6: clone + 0x43 (0x7fa5184b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #5: + 0x8609 (0x7f72d5c04609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f72d59cf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa4c692c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]: -[default3]:frame #1: + 0xe32119 (0x7fa4c788f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #2: + 0xd3e95 (0x7fa5136a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7283e45897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #3: + 0x8609 (0x7fa5186eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fa5184b6353 in 
/lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:frame #1: + 0xe32119 (0x7f7284da8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f72d0bbde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f72d5c04609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f72d59cf353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3963e81897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f396515ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f396515fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3965160dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f39b0bf9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f39b5c40609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f39b5a0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3963e81897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f396515ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f396515fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3965160dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f39b0bf9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f39b5c40609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f39b5a0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3963e81897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3964de4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f39b0bf9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f39b5c40609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f39b5a0b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4475530897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4476809c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f447680ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f447680fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f44c22a8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f44c72ef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f44c70ba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4475530897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4476809c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f447680ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f447680fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f44c22a8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f44c72ef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f44c70ba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4475530897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f4476493119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f44c22a8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f44c72ef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f44c70ba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank50]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default2]:[rank50]: dist.recv( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fda03bec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fda04ec5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fda04ecaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fda04ecbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fda50964e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fda559ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fda55776353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600094 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fda03bec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fda04ec5c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fda04ecaa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fda04ecbdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fda50964e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fda559ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fda55776353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fda03bec897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fda04b4f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fda50964e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7fda559ab609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fda55776353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b48ff1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4b4a2cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4b4a2cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4b4a2d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f4b95d69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f4b9adb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4b9ab7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b48ff1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4b4a2cac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4b4a2cfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4b4a2d0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f4b95d69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f4b9adb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f4b9ab7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b48ff1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f4b49f54119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f4b95d69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f4b9adb0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f4b9ab7b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd69ad4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcd6adadc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcd6adb2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcd6adb3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fcdb684ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fcdbb893609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fcdbb65e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd69ad4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcd6adadc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcd6adb2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcd6adb3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fcdb684ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fcdbb893609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fcdbb65e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcd69ad4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fcd6aa37119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fcdb684ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7fcdbb893609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fcdbb65e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd666daa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd668083c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd668088a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd668089dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd6b3b22e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd6b8b69609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd6b8934353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd666daa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd668083c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd668088a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd668089dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd6b3b22e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd6b8b69609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd6b8934353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd666daa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fd667d0d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd6b3b22e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #3: + 0x8609 (0x7fd6b8b69609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd6b8934353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1bd6b93897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1bd7e6cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1bd7e71a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1bd7e72dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1c2390be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1c28952609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1c2871d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1bd6b93897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1bd7e6cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1bd7e71a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1bd7e72dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1c2390be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1c28952609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1c2871d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1bd6b93897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f1bd7af6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f1c2390be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f1c28952609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f1c2871d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9106415897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91076eec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91076f3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91076f4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f915318de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f91581d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f9157f9f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9106415897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91076eec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91076f3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91076f4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f915318de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f91581d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f9157f9f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9106415897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f9107378119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f915318de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f91581d4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f9157f9f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efde163b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efde2914c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efde2919a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efde291adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7efe2e3b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7efe333fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7efe331c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efde163b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7efde2914c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7efde2919a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7efde291adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7efe2e3b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7efe333fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7efe331c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efde163b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7efde259e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7efe2e3b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7efe333fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7efe331c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fb6d3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4fb8017c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4fb801ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4fb801ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5003ab6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5008afd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f50088c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fb6d3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4fb8017c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4fb801ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4fb801ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f5003ab6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f5008afd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f50088c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fb6d3e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f4fb7ca1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f5003ab6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f5008afd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f50088c8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76d1b86897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f76d2e5fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f76d2e64a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76d2e65dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f771e8fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7723945609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f7723710353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76d1b86897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f76d2e5fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f76d2e64a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f76d2e65dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f771e8fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f7723945609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f7723710353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f76d1b86897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f76d2ae9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f771e8fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f7723945609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f7723710353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f58d19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f59ff2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f59ff7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f59ff8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f5fa5a91e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5faaad8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f5faa8a3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=16384, NumelOut=16384, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f58d19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5f59ff2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5f59ff7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5f59ff8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f5fa5a91e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f5faaad8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f5faa8a3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5f58d19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f5f59c7c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f5fa5a91e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f5faaad8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f5faa8a3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8187b50897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8188e29c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8188e2ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8188e2fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f81d48c8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f81d990f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f81d96da353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=2097152, NumelOut=2097152, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8187b50897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8188e29c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8188e2ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8188e2fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f81d48c8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f81d990f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f81d96da353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8187b50897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f8188ab3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f81d48c8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f81d990f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f81d96da353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -W0703 10:00:03.153000 139720356824896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 79077 closing signal SIGTERM -W0703 10:00:03.153000 139720356824896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 79078 closing signal SIGTERM -W0703 10:00:03.153000 139720356824896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 79079 closing signal SIGTERM -W0703 10:00:03.153000 139720356824896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 79080 closing signal SIGTERM -W0703 10:00:03.153000 139720356824896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 79081 closing signal SIGTERM -W0703 10:00:03.153000 139720356824896 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 79083 closing signal SIGTERM -E0703 10:00:04.499000 139720356824896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 79076) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:00:03 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 79082) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 79082 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:00:03 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 79076) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 79076 -============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -W0703 10:00:06.923000 139775469889280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-164-207.ec2.internal_474507_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:07.192000 140143948781312 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_746227_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:07.371000 139750468667136 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_964214_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:07.689000 140049200645888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-166-125.ec2.internal_208089_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:07.706000 139718156400384 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_45031_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:07.877000 140398149916416 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-147.ec2.internal_861909_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:07.880000 140285127116544 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_949171_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:00:08.093000 140403810649920 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 861987 closing signal SIGTERM -W0703 10:00:08.116000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45103 closing signal SIGTERM -W0703 10:00:08.116000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45104 closing signal SIGTERM -W0703 10:00:08.116000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45105 closing signal SIGTERM -W0703 10:00:08.116000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45106 closing signal SIGTERM -W0703 10:00:08.117000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45107 closing signal SIGTERM -W0703 10:00:08.117000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45108 closing signal SIGTERM -W0703 10:00:08.117000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45109 closing signal SIGTERM -W0703 10:00:08.117000 139723817133888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45110 closing signal SIGTERM -W0703 10:00:08.145000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746300 closing signal SIGTERM -W0703 10:00:08.146000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746301 closing signal SIGTERM -W0703 10:00:08.146000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746302 closing signal SIGTERM -W0703 10:00:08.146000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746303 closing signal SIGTERM -W0703 10:00:08.146000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746304 closing signal SIGTERM -W0703 10:00:08.146000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746305 closing signal SIGTERM -W0703 10:00:08.146000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746306 closing signal SIGTERM -W0703 10:00:08.146000 140149609514816 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 746307 closing signal SIGTERM -W0703 10:00:08.148000 139756129400640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 964287 closing signal SIGTERM -W0703 10:00:08.148000 139756129400640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 964289 closing signal SIGTERM -W0703 10:00:08.148000 139756129400640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 964290 closing signal SIGTERM -W0703 10:00:08.148000 139756129400640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 964292 closing signal SIGTERM -W0703 10:00:08.148000 139756129400640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 964293 closing signal SIGTERM -W0703 10:00:08.148000 139756129400640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 964294 closing signal SIGTERM -W0703 10:00:08.204000 139781130622784 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 474582 closing signal SIGTERM -E0703 10:00:08.268000 140054861379392 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 208163) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 
-W0703 10:00:08.281000 140054861379392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_208089_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:08.307000 140054861379392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_208089_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 10:00:08.334000 140290787850048 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 949243) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:00:08.335000 140054861379392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-166-125.ec2.internal_208089_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 208164) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208164 -[2]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 208165) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208165 -[3]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 208166) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208166 -[4]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 208167) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208167 -[5]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 208168) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208168 -[6]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 208169) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208169 -[7]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal 
- rank : 63 (local_rank: 7) - exitcode : -6 (pid: 208170) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208170 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:00:08 - host : ip-26-0-166-125.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 208163) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 208163 -============================================================ -W0703 10:00:08.346000 140290787850048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_949171_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:08.374000 140290787850048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_949171_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:08.402000 140290787850048 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_949171_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 949244) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949244 -[2]: - time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 949245) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949245 -[3]: - time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 949246) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949246 -[4]: - time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 949247) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949247 -[5]: - time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 949248) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949248 -[6]: 
- time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 949249) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949249 -[7]: - time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 949250) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949250 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:00:08 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 949243) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 949243 -============================================================ -E0703 10:00:08.546000 140403810649920 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 861982) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:00:08.559000 140403810649920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_861909_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:08.589000 140403810649920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_861909_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:08.620000 140403810649920 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-147.ec2.internal_861909_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:00:08 - host : ip-26-0-163-147.ec2.internal - rank : 33 (local_rank: 1) - exitcode : -6 (pid: 861983) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 861983 -[2]: - time : 2024-07-03_10:00:08 - host : ip-26-0-163-147.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 861984) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 861984 -[3]: - time : 2024-07-03_10:00:08 - host : ip-26-0-163-147.ec2.internal - rank : 35 (local_rank: 3) - exitcode : -6 
(pid: 861985) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 861985 -[4]: - time : 2024-07-03_10:00:08 - host : ip-26-0-163-147.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 861986) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 861986 -[5]: - time : 2024-07-03_10:00:08 - host : ip-26-0-163-147.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 861988) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 861988 -[6]: - time : 2024-07-03_10:00:08 - host : ip-26-0-163-147.ec2.internal - rank : 39 (local_rank: 7) - exitcode : -6 (pid: 861989) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 861989 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:00:08 - host : ip-26-0-163-147.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 861982) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 861982 -============================================================ -E0703 10:00:08.675000 139781130622784 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 474580) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:00:08.689000 139781130622784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_474507_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:08.717000 139781130622784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_474507_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:08.742000 139781130622784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_474507_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:00:08 - host : ip-26-0-164-207.ec2.internal - rank : 41 (local_rank: 1) - exitcode : -6 (pid: 474581) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 474581 -[2]: - time : 2024-07-03_10:00:08 - host : ip-26-0-164-207.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 474583) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 474583 -[3]: - time : 2024-07-03_10:00:08 - host : ip-26-0-164-207.ec2.internal - rank : 44 (local_rank: 4) - exitcode : -6 (pid: 474584) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 474584 -[4]: - time : 2024-07-03_10:00:08 - host : ip-26-0-164-207.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 474585) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 474585 -[5]: - time : 2024-07-03_10:00:08 - host : ip-26-0-164-207.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 474586) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 474586 -[6]: - time : 2024-07-03_10:00:08 - host : ip-26-0-164-207.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 474587) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 474587 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:00:08 - host : ip-26-0-164-207.ec2.internal - rank : 40 (local_rank: 0) - exitcode : -6 (pid: 474580) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 474580 -============================================================ -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-166-125: task 7: Exited with exit code 1 -srun: error: ip-26-0-163-147: task 4: Exited with exit code 1 -srun: error: ip-26-0-164-207: task 5: Exited with exit code 1 -E0703 10:00:09.639000 139756129400640 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 964288) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:00:09.652000 139756129400640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_964214_0' has failed to shutdown the rendezvous 'none' due to 
an error of type RendezvousConnectionError. -W0703 10:00:09.678000 139756129400640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_964214_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:09.689000 139756129400640 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_964214_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:00:08 - host : ip-26-0-165-24.ec2.internal - rank : 52 (local_rank: 4) - exitcode : -6 (pid: 964291) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 964291 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:00:08 - host : ip-26-0-165-24.ec2.internal - rank : 49 (local_rank: 1) - exitcode : -6 (pid: 964288) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 964288 -============================================================ -W0703 10:00:10.138000 139723817133888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_45031_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:10.151000 139723817133888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_45031_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 10:00:10.269000 140149609514816 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_746227_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:00:10.279000 140149609514816 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_746227_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-165-24: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
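The log above ends in the usual NCCL watchdog signature: a SEND collective at SeqNum=15 exceeds the 600000 ms timeout, every rank is torn down with SIGABRT (torchrun reports exitcode -6 on all hosts), and once the agent hosting the c10d store exits, the surviving nodes emit the trailing RendezvousConnectionError / Broken pipe noise. The bench.slurm scripts deleted in this diff (see the dp-2_tp-16_pp-2_mbz-512 copy below) classify such runs by grepping the log after srun returns; the following is only a minimal sketch of that check, with the log path and the LOG/STATUS variable names used purely for illustration:

  LOG=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-1_tp-16_pp-4_mbz-1/log.out
  # Order matters: OOM and illegal-memory-access are checked before the generic NCCL timeout.
  if grep -q "OutOfMemoryError" "$LOG"; then
      STATUS="oom"
  elif grep -q " CUDA error: an illegal memory access" "$LOG"; then
      STATUS="oom"
  elif grep -q "Timeout at NCCL" "$LOG"; then
      STATUS="timeout"   # the watchdog lines above match this branch
  else
      STATUS="fail"
  fi
  printf "%s" "$STATUS" > "$(dirname "$LOG")/status.txt"

For this run the third branch fires, consistent with the "timeout" value recorded in status.txt files elsewhere in this sweep (e.g. dp-2_tp-16_pp-2_mbz-4 below).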
diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-4/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/bench.slurm deleted file mode 100644 index ce51bf4ddd33cb558ad699c1d48142f3a95e78e0..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/config.yaml deleted file mode 100644 index 1e3567532a150a4e73f5f86c574a08a8aaf5bb71..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 512 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out deleted file mode 100644 index e5167fb28c70d41599fdcae6a872a01ab58f8c44..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/log.out +++ /dev/null @@ -1,6285 +0,0 @@ -======================== -START TIME: Wed Jul 3 00:24:47 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 00:24:50.297000 139640441337664 torch/distributed/run.py:757] -W0703 00:24:50.297000 139640441337664 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.297000 139640441337664 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:24:50.297000 139640441337664 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.298000 140622279550784 torch/distributed/run.py:757] -W0703 00:24:50.298000 140622279550784 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.298000 140622279550784 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:24:50.298000 140622279550784 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.298000 140611591423808 torch/distributed/run.py:757] -W0703 00:24:50.298000 140611591423808 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.298000 140611591423808 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:24:50.298000 140611591423808 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.296000 139638215915328 torch/distributed/run.py:757] -W0703 00:24:50.296000 139638215915328 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.296000 139638215915328 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:24:50.296000 139638215915328 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.304000 139732478457664 torch/distributed/run.py:757] -W0703 00:24:50.304000 139732478457664 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.304000 139732478457664 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 00:24:50.304000 139732478457664 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.311000 139859554359104 torch/distributed/run.py:757] -W0703 00:24:50.311000 139859554359104 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.311000 139859554359104 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:24:50.311000 139859554359104 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.315000 140173338302272 torch/distributed/run.py:757] -W0703 00:24:50.315000 140173338302272 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.315000 140173338302272 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:24:50.315000 140173338302272 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.350000 140133463041856 torch/distributed/run.py:757] -W0703 00:24:50.350000 140133463041856 torch/distributed/run.py:757] ***************************************** -W0703 00:24:50.350000 140133463041856 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:24:50.350000 140133463041856 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 00:25:10 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=2, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:25:10 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=512, -[default0]:07/03/2024 00:25:10 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512')), -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 00:25:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default2]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=15|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=8|ip-26-0-171-88]: No checkpoint path provided. 
-[default2]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=1|PP=1|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 00:25:27 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. 
-[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=3|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=1|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=6|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=7|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=2|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. 
Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=5|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=4|ip-26-0-162-233]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: No checkpoint path provided. 
-[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: No checkpoint path provided. 
-[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 00:25:27 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 00:25:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 00:25:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 00:25:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 00:25:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 00:25:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 00:25:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 00:25:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:30 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 00:25:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 00:25:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 00:25:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 00:25:31.408721 | mbs: 512 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 00:25:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 00:25:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default2]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. 
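A quick sanity check on the figures in the [Start training] line above (mbs: 512 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096) and on the [ZeRO sharding] lines reporting 50.00% of optimizer states per DP rank. This is a minimal sketch based only on the logged config values (ParallelismArgs.dp=2, TokensArgs.micro_batch_size=512, batch_accumulation_per_replica=1, zero_stage=1); the variable names are illustrative, not nanotron identifiers.

    # Reproduce the batch and sharding figures implied by the logged config.
    dp = 2                   # ParallelismArgs.dp
    micro_batch_size = 512   # TokensArgs.micro_batch_size
    grad_accum = 1           # TokensArgs.batch_accumulation_per_replica
    sequence_length = 4096   # TokensArgs.sequence_length

    global_batch_size = dp * micro_batch_size * grad_accum
    tokens_per_step = global_batch_size * sequence_length
    print(global_batch_size)   # 1024, matching the log
    print(tokens_per_step)     # 4194304 tokens per optimizer step

    # With ZeRO stage 1 the optimizer states are sharded across the dp replicas,
    # so each DP rank holds 1/dp of them: 43.2M / 2 = 21.6M params' states (50.00%),
    # matching the [ZeRO sharding] lines above.
    print(43.2e6 / dp)         # 21600000.0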
-[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=8|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=13|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=10|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=15|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=12|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=5|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=14|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=6|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=2|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=9|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=11|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:25:31 [WARNING|DP=1|PP=0|TP=4|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:25:31 [WARNING|DP=0|PP=1|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:25:31 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:25:31 [WARNING|DP=1|PP=1|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: output = model(**micro_batch) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank24]: sharded_logits = self.model( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank24]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank24]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank24]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank24]: output = self.o_proj(attention_output) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank24]: return row_linear( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: out = F.linear(input, weight, bias) -[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. 
GPU -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: sharded_logits = self.model( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank25]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank25]: output = self.o_proj(attention_output) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank25]: return row_linear( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default5]:[rank21]: Traceback (most recent call last): -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.55 GiB is free. Including non-PyTorch memory, this process has 73.77 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
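For context on the failure mode: the OOM is raised inside the row-parallel o_proj at F.linear, and "Tried to allocate 8.00 GiB" matches exactly the size of a [micro_batch=512, seq_len=4096, hidden=2048] activation in bf16 from the logged config. Below is a minimal sketch of that arithmetic, together with the allocator option the error message itself suggests (assuming it is exported before the CUDA context is created); names here are illustrative only.

    import os

    # Size of the o_proj output activation: [micro_batch, seq_len, hidden_size] in bf16 (2 bytes).
    micro_batch_size, sequence_length, hidden_size = 512, 4096, 2048
    bytes_per_bf16 = 2
    gib = micro_batch_size * sequence_length * hidden_size * bytes_per_bf16 / 2**30
    print(gib)  # 8.0 -> matches "Tried to allocate 8.00 GiB" in the tracebacks

    # The error text recommends this allocator setting to reduce fragmentation; it only
    # takes effect if set before the first CUDA allocation (e.g. in the job script or
    # before any torch.cuda call).
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"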
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: output = model(**micro_batch) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: sharded_logits = self.model( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank22]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
-[default2]:[rank26]: Traceback (most recent call last):
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank26]: trainer.train(dataloader)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank26]: output = model(**micro_batch)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank26]: sharded_logits = self.model(
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default2]:[rank26]: output = self.pp_block(**new_kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default2]:[rank26]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default2]:[rank26]: output = self.o_proj(attention_output)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default2]:[rank26]: return row_linear(
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default2]:[rank26]: out = F.linear(input, weight, bias)
-[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.38 GiB is free. Including non-PyTorch memory, this process has 73.94 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Ranks 16, 17, 20, 21, 22, 27, 28, 29, 30 and 31 print this same traceback, interleaved line by line with the one above, and fail with the identical torch.cuda.OutOfMemoryError: an 8.00 GiB allocation in F.linear inside row_linear, reached through the o_proj projection of the llama attention block. Where the figures are reported, only the memory numbers differ, with 5.38-5.59 GiB free of the 79.33 GiB capacity and 73.73-73.94 GiB in use per process. The tracebacks of ranks 18, 19 and 23 follow the same call path and are still being printed at this point in the log.
-[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: output = self.o_proj(attention_output) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank23]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward 
-[default2]:[rank18]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank23]: output = self.o_proj(attention_output) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.59 GiB is free. Including non-PyTorch memory, this process has 73.73 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: output = self.o_proj(attention_output) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: return row_linear( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.50 GiB is free. Including non-PyTorch memory, this process has 73.82 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.50 GiB is free. Including non-PyTorch memory, this process has 73.82 GiB memory in use. 
Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: output = model(**micro_batch) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank3]: sharded_logits = self.model( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank0]: output = model(**micro_batch) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: sharded_logits = self.model( -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank0]: output = self.pp_block(**new_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank0]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank3]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank0]: output = self.o_proj(attention_output) -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) 
-[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank8]: output = self.o_proj(attention_output) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) 
-[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank8]: out = F.linear(input, weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: Traceback (most recent call last): -[default1]:[rank1]: Traceback (most recent call last): -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank4]: trainer.train(dataloader) -[default1]:[rank1]: trainer.train(dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: output = self.o_proj(attention_output) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in 
training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default0]:[rank0]: return row_linear( -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15][default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) 
-[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return row_linear( -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank15]: output = self.o_proj(attention_output) -[default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default1]:[rank1]: output = model(**micro_batch) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. 
Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.59 GiB is free. Including non-PyTorch memory, this process has 73.73 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank4]: output = model(**micro_batch) -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: out = F.linear(input, weight, bias) -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter -[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: output = model(**micro_batch) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*arg[default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. 
GPU -s, **kwargs) -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: sharded_logits = self.model( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: out = F.linear(input, weight, bias) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank2]: trainer.train(dataloader) -[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. 
GPU  has a total capacity of 79.33 GiB of which 6.09 GiB is free. Including non-PyTorch memory, this process has 73.23 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank10]: output = self.o_proj(attention_output) -[default1]:[rank1]: sharded_logits = self.model( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: return row_linear( -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default4]:[rank4]: sharded_logits = self.model( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 5.50 GiB is free. 
Including non-PyTorch memory, this process has 73.82 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[... interleaved, near-identical CUDA out-of-memory tracebacks from ranks 1, 2, 4, 6, 7, 9, 11, 12, 13 and 14, all failing at the same o_proj row_linear call; a representative traceback from rank 5 follows, with the repeated torch/nn/modules/module.py dispatch frames (_wrapped_call_impl / _call_impl, lines 1532 / 1541) omitted ...]
-[default5]:[rank5]: Traceback (most recent call last):
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default5]:[rank5]: trainer.train(dataloader)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 252, in train_batch_iter
-[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank5]: output = model(**micro_batch)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank5]: sharded_logits = self.model(
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default5]:[rank5]: output = self.pp_block(**new_kwargs)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default5]:[rank5]: output = self.o_proj(attention_output)
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default5]:[rank5]: return row_linear(
-[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default5]:[rank5]: out = F.linear(input, weight, bias)
-[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.09 GiB is free. Including non-PyTorch memory, this process has 73.23 GiB memory in use. Of the allocated memory 63.46 GiB is allocated by PyTorch, and 971.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[... the remaining ranks raise the same torch.cuda.OutOfMemoryError: 8.00 GiB requested on 79.33 GiB GPUs with between 5.50 GiB and 6.17 GiB free, 63.46 GiB allocated by PyTorch, and 971.71 MiB reserved but unallocated ...]
-[default6]:[rank62]: Traceback (most recent call last):
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default6]:[rank62]: trainer.train(dataloader)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter(
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default6]:[rank62]: output = model(**micro_batch)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank62]: return self._call_impl(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank62]: return forward_call(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default6]:[rank62]: sharded_logits = self.model(
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank62]: return self._call_impl(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank62]: return forward_call(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default6]:[rank62]: return self._call_impl(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default6]:[rank62]: return forward_call(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default6]:[rank62]: pipeline_state.run_communication()
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default6]:[rank62]: recv_activation_tensor = recv_activation()
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default6]:[rank62]: dist.recv(
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default6]:[rank62]: return func(*args, **kwargs)
-[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait()
-[default6]:[rank62]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank62]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank62]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f44b543c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default6]:[rank62]: frame #1: + 0x5b3a23e (0x7f44eef5923e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f44eef53c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f44eef53f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f44eef54fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f44eef09371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f44eef09371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f44eef09371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f44eef09371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f44b6716189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank62]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f44b671d610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank62]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f44b673c978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default6]:[rank62]: frame #12: + 0x5adc309 (0x7f44eeefb309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #13: + 0x5ae6f10 (0x7f44eef05f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #14: + 0x5ae6fa5 (0x7f44eef05fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #15: + 0x5124446 (0x7f44ee543446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #16: + 0x1acf4b8 (0x7f44eaeee4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #17: + 0x5aee004 (0x7f44eef0d004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #18: + 0x5af36b5 (0x7f44eef126b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
-[default6]:[rank62]: frame #19: + 0xd2631e (0x7f4501afc31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so)
-[default6]:[rank62]: frame #20: + 0x47def4 (0x7f4501253ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so)
-[default6]:[rank62]: frame #21: + 0x1445a6 (0x56161f1f45a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10)
-[default6]:[rank62]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56161f1eda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10)
-[default6]:[rank62]: frame #23: + 0x150866 (0x56161f200866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10)
-[default6]:[rank62]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56161f1e9142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10)
-[default6]:[rank62]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56161f1f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10)
-[default6]:[rank62]: frame #26: PyObject_Call + 0xbc (0x56161f200f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56161f1e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56161f1f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56161f1e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #30: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56161f1e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #32: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56161f1e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #34: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56161f1e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56161f1ecf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56161f1fec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #38: + 0x211239 (0x56161f2c1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56161f1eda6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56161f1e93e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56161f1f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56161f1e4c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56161f1f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56161f1e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #45: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #46: PyObject_Call + 0xbc (0x56161f200f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56161f1e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #48: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #49: PyObject_Call + 0xbc (0x56161f200f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #50: 
_PyEval_EvalFrameDefault + 0x2d83 (0x56161f1e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56161f1f4a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56161f1ed007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56161f1fec39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #54: + 0x211239 (0x56161f2c1239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #55: PyObject_Call + 0x207 (0x56161f201067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56161f1e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #57: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56161f1e58fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #59: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #60: PyObject_Call + 0xbc (0x56161f200f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56161f1e72b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #62: + 0x150582 (0x56161f200582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: frame #63: PyObject_Call + 0xbc (0x56161f200f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank62]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
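The rank 13 trace above is the root failure: the row-parallel o_proj projection calls F.linear, which asks for a further 8.00 GiB while only 5.59 GiB of the 79.33 GiB device is still free, and the allocator message suggests trying PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True against fragmentation. The DistBackendError traces on the other ranks (62 above, then 32, 36, 33 and 37 below) look secondary: those ranks are blocked in _recv_meta waiting for pipeline activations, and the TCPStore lookup fails with "Connection reset by peer" once a peer has gone down, which matches the closing hint about "a possible application crash on rank 0 or a network set up issue". The sketch below is illustrative only and not part of nanotron: it assumes the allocator variable is set before CUDA is initialised, and try_linear is a hypothetical wrapper that surfaces an allocator summary instead of letting the rank die silently.

    import os

    # The CUDA caching allocator reads PYTORCH_CUDA_ALLOC_CONF once, at first CUDA
    # initialisation, so it must be set before torch touches the GPU (e.g. at the
    # top of the training entry point, or exported by the launching script).
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch
    import torch.nn.functional as F

    def try_linear(x, weight, bias=None):
        # Hypothetical wrapper around the same F.linear call that OOMed above:
        # print an allocator summary so fragmentation vs. genuine exhaustion is
        # visible in the log, then re-raise so the job still fails loudly.
        try:
            return F.linear(x, weight, bias)
        except torch.cuda.OutOfMemoryError:
            print(torch.cuda.memory_summary(abbreviated=True))
            torch.cuda.empty_cache()
            raise

Whether expandable segments would actually save this run is not something the log can confirm; with roughly 5.6 GiB free against an 8 GiB request, the configuration may simply not fit at this micro-batch size.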
-[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank32]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank32]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8db8e35897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank32]: frame #1: + 0x5b3a23e (0x7f8df295223e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f8df294cc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f8df294cf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f8df294dfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8df2902371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8df2902371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8df2902371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8df2902371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f8dba10f189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f8dba116610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f8dba135978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #12: + 0x5adc309 (0x7f8df28f4309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #13: + 0x5ae6f10 (0x7f8df28fef10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #14: + 0x5ae6fa5 (0x7f8df28fefa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #15: + 0x5124446 (0x7f8df1f3c446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #16: + 0x1acf4b8 (0x7f8dee8e74b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #17: + 0x5aee004 (0x7f8df2906004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #18: + 0x5af36b5 (0x7f8df290b6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #19: + 0xd2631e (0x7f8e054f531e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #20: + 0x47def4 (0x7f8e04c4cef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank32]: frame #21: + 0x1445a6 (0x564b0c1175a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #22: _PyObject_MakeTpCall + 0x26b (0x564b0c110a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #23: + 0x150866 (0x564b0c123866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) 
-[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x564b0c10c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #25: _PyFunction_Vectorcall + 0x6c (0x564b0c117a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #26: PyObject_Call + 0xbc (0x564b0c123f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x564b0c10a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #28: _PyFunction_Vectorcall + 0x6c (0x564b0c117a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x564b0c1088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: frame #30: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x564b0c1088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: frame #32: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x564b0c1088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: frame #34: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x564b0c1088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: return forward_call(*args, **kwargs) 
-[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x564b0c10ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #37: _PyObject_Call_Prepend + 0x69 (0x564b0c121c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: frame #38: + 0x211239 (0x564b0c1e4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #39: _PyObject_MakeTpCall + 0x26b (0x564b0c110a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x564b0c10c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: frame #41: _PyFunction_Vectorcall + 0x6c (0x564b0c117a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x564b0c107c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: frame #43: _PyFunction_Vectorcall + 0x6c (0x564b0c117a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x564b0c1088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: frame #45: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #46: PyObject_Call + 0xbc (0x564b0c123f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #47: 
_PyEval_EvalFrameDefault + 0x2d83 (0x564b0c10a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #48: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: frame #49: PyObject_Call + 0xbc (0x564b0c123f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x564b0c10a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank36]: dist.recv( -[default0]:[rank32]: frame #51: _PyFunction_Vectorcall + 0x6c (0x564b0c117a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default0]:[rank32]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x564b0c110007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #53: _PyObject_Call_Prepend + 0x69 (0x564b0c121c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: frame #54: + 0x211239 (0x564b0c1e4239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: frame #55: PyObject_Call + 0x207 (0x564b0c124067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank32]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x564b0c10a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank32]: frame #57: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank32]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x564b0c1088fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2638782897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank36]: frame #1: + 0x5b3a23e (0x7f267229f23e in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f2672299c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #59: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f2672299f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f267229afd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #60: PyObject_Call + 0xbc (0x564b0c123f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f267224f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f267224f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f267224f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x564b0c10a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f267224f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f2639a5c189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank36]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f2639a63610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank36]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f2639a82978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank32]: frame #62: + 0x150582 (0x564b0c123582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #12: + 0x5adc309 (0x7f2672241309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: frame #63: PyObject_Call + 0xbc (0x564b0c123f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #13: + 0x5ae6f10 (0x7f267224bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank32]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default4]:[rank36]: frame #14: + 0x5ae6fa5 (0x7f267224bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #15: + 0x5124446 (0x7f2671889446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #16: + 0x1acf4b8 (0x7f266e2344b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #17: + 0x5aee004 (0x7f2672253004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #18: + 0x5af36b5 (0x7f26722586b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank36]: frame #19: + 0xd2631e (0x7f2684e4231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank36]: frame #20: + 0x47def4 (0x7f2684599ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank36]: frame #21: + 0x1445a6 (0x564cc71475a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #22: _PyObject_MakeTpCall + 0x26b (0x564cc7140a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #23: + 0x150866 (0x564cc7153866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x564cc713c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #25: _PyFunction_Vectorcall + 0x6c (0x564cc7147a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #26: PyObject_Call + 0xbc (0x564cc7153f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x564cc713a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #28: _PyFunction_Vectorcall + 0x6c (0x564cc7147a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x564cc71388fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #30: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x564cc71388fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #32: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x564cc71388fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #34: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x564cc71388fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x564cc713ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: 
frame #37: _PyObject_Call_Prepend + 0x69 (0x564cc7151c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #38: + 0x211239 (0x564cc7214239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #39: _PyObject_MakeTpCall + 0x26b (0x564cc7140a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x564cc713c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #41: _PyFunction_Vectorcall + 0x6c (0x564cc7147a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x564cc7137c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #43: _PyFunction_Vectorcall + 0x6c (0x564cc7147a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x564cc71388fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #45: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #46: PyObject_Call + 0xbc (0x564cc7153f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x564cc713a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #48: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #49: PyObject_Call + 0xbc (0x564cc7153f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x564cc713a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #51: _PyFunction_Vectorcall + 0x6c (0x564cc7147a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x564cc7140007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #53: _PyObject_Call_Prepend + 0x69 (0x564cc7151c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #54: + 0x211239 (0x564cc7214239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #55: PyObject_Call + 0x207 (0x564cc7154067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x564cc713a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #57: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x564cc71388fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #59: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #60: PyObject_Call + 0xbc (0x564cc7153f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x564cc713a2b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #62: + 0x150582 (0x564cc7153582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: frame #63: PyObject_Call + 0xbc (0x564cc7153f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank36]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: 
sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl
-[default1]:[rank33]: return self._call_impl(*args, **kwargs)
-[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank33]: return forward_call(*args, **kwargs)
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank33]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank33]: pipeline_state.run_communication()
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank33]: recv_activation_tensor = recv_activation()
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank33]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank33]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank33]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank33]: dist.recv(
-[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank33]: return func(*args, **kwargs)
-[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank33]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank33]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank33]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default5]:[rank37]: recv_activation_tensor = recv_activation()
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default5]:[rank37]: dist.recv(
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank37]: return func(*args, **kwargs)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank37]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default5]:[rank37]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default1]:[rank57]: Traceback (most recent call last):
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank57]: trainer.train(dataloader)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank57]: output = model(**micro_batch)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]: return self._call_impl(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]: return forward_call(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank57]: sharded_logits = self.model(
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]: return self._call_impl(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]: return forward_call(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]: return self._call_impl(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]: return forward_call(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank57]: pipeline_state.run_communication()
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank57]: recv_activation_tensor = recv_activation()
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank57]: dist.recv(
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank57]: return func(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank57]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank57]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank57]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (libc10.so)
-[default1]:[rank57]: frame #1: <unknown function> + 0x5b3a23e (libtorch_cpu.so)
-[default1]:[rank57]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef<std::string>, std::chrono::duration<long, std::ratio<1l, 1000l> >) + 0x2c7 (libtorch_cpu.so)
-[default1]:[rank57]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (libtorch_cpu.so)
-[default1]:[rank57]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (libtorch_cpu.so)
-[default1]:[rank57]: frames #5-#8: c10d::PrefixStore::get(std::string const&) + 0x31 (libtorch_cpu.so)
-[default1]:[rank57]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (libtorch_cuda.so)
-[default1]:[rank57]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (libtorch_cuda.so)
-[default1]:[rank57]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor, std::allocator<at::Tensor> >&, int, int) + 0x5f8 (libtorch_cuda.so)
-[default1]:[rank57]: frames #12-#20: anonymous frames in libtorch_cpu.so / libtorch_python.so
-[default1]:[rank57]: frames #21-#63: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call) in python3.10
-[default1]:[rank57]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default7]:[rank63]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default7]:[rank63]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default2]:[rank34]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default2]:[rank34]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default2]:[rank34]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (libc10.so)
-[default2]:[rank34]: frame #1: <unknown function> + 0x5b3a23e (libtorch_cpu.so)
-[default2]:[rank34]: frames #2-#5: c10d::TCPStore::doWait / doGet / get and c10d::PrefixStore::get (libtorch_cpu.so)
-[default6]:[rank38]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default6]:[rank38]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default6]:[rank38]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (libc10.so)
-[default6]:[rank38]: frame #1: <unknown function> + 0x5b3a23e (libtorch_cpu.so)
-[default6]:[rank38]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f89da99fc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default6]:[rank38]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f89da99ff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f89da9a0fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd66a86f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) 
-[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default[default6]:[rank38]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f89da955371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/p[default6]:[rank38]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f89da955371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -arallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrappe[default6]:[rank38]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f89da955371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -r -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank38]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f89da955371 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: Traceback (most recent call last): -[default2]:[rank34]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd66a86f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default6]:[rank46]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff061c56897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank46]: frame #1: + 0x5b3a23e (0x7ff09b77323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7ff09b76dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7ff09b76df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]:[default7]:[rank39]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fd66a86f371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fd63207c189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) - frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7ff09b76efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff09b723371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff09b723371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff09b723371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff09b723371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[defaul[default3]:[rank35]: trainer.train(dataloader) -[default2]:[rank34]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fd632083610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f89a2162189 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -t6]:[rank46]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7ff062f30189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7ff062f37610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7ff062f56978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank46]: frame #12: + 0x5adc309 (0x7ff09b715309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #13: + 0x5ae[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f89a2169610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank38]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f89a2188978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -6f10 (0x7ff09b71ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #14: + 0x5ae6fa5 (0x7ff09b71ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #15: + 0x5124446 (0x7ff09ad5d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #16: + 0x1acf4b8 (0x7ff0977084b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #17: + 0x5aee004 (0x7ff09b727004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #18: + 0x5af36b5 (0x7ff09b72c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluste[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: frame #12: + 0x5adc309 (0x7f89da947309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fd6320a2978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -r/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #19: + 0xd2631e (0x7ff0ae31631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #20: + 0x47def4 (0x7ff0ada6def4 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank46]: frame #21: + 0x1445a6 (0x5640a2a865a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5640a2a7fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #23: + 0x150866 (0x5640a2a92866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5640a2a7b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -/bin/python3.10) -[default6]:[rank46]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5640a2a86a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #26: PyObject_Call + 0xbc (0x5640a2a92f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5640a2a792b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5640a2a86a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5640a2a778fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #30: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5640a2a778fa in /fsx/ferdinandmom/miniforge3/[default6]:[rank38]: frame #13: + 0x5ae6f10 (0x7f89da951f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #12: + 0x5adc309 (0x7fd66a861309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #32: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5640a2a778fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #34: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5640a2a778fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5640a2a7ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5640a2a90c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: frame #38: + 0x211239 (0x5640a2b53239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #39: 
_PyObject_MakeTpCall + 0x26b (0x5640a2a7fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5640a2a7b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5640a2a86a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #13: + 0x5ae6f10 (0x7fd66a86bf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5640a2a76c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5640a2a86a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5640a2a778fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: frame #45: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #46: PyObject_Call + 0xbc (0x5640a2a92f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5640a2a792b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #48: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #14: + 0x5ae6fa5 (0x7fd66a86bfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #15: + 0x5124446 (0x7fd669ea9446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #49: PyObject_Call + 0xbc (0x5640a2a92f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5640a2a792b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #14: + 0x5ae6fa5 (0x7f89da951fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank34]: frame #16: + 0x1acf4b8 (0x7fd6668544b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5640a2a86a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5640a2a7f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5640a2a90c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #17: + 0x5aee004 (0x7fd66a873004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #54: + 0x211239 (0x5640a2b53239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default6]:[rank46]: frame #55: PyObject_Call + 0x207 (0x5640a2a93067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5640a2a792b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #57: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #15: + 0x5124446 (0x7f89d9f8f446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank46]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5640a2a778fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #59: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #60: PyObject_Call + 0xbc (0x5640a2a92f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #18: + 0x5af36b5 (0x7fd66a8786b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5640a2a792b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #62: + 0x150582 (0x5640a2a92582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: frame #63: PyObject_Call + 0xbc (0x5640a2a92f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank46]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default2]:[rank34]: frame #19: + 0xd2631e (0x7fd67d46231e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: frame #20: + 0x47def4 (0x7fd67cbb9ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default6]:[rank38]: frame #16: + 0x1acf4b8 (0x7f89d693a4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #17: + 0x5aee004 (0x7f89da959004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #18: + 0x5af36b5 (0x7f89da95e6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: frame #19: + 0xd2631e (0x7f89ed54831e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: frame #20: + 0x47def4 (0x7f89ecc9fef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank34]: frame #21: + 0x1445a6 (0x56188166a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #21: + 0x1445a6 (0x5602ba55d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: frame #22: _PyObject_MakeTpCall + 0x26b (0x561881663a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5602ba556a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #23: + 0x150866 (0x5602ba569866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5602ba552142 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #23: + 0x150866 (0x561881676866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: sharded_logits = self.model( -[default6]:[rank38]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5602ba55da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56188165f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56188166aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #26: PyObject_Call + 0xbc (0x561881676f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default6]:[rank38]: frame #26: PyObject_Call + 0xbc (0x5602ba569f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank34]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56188165d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5602ba5502b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5602ba55da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default2]:[rank34]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56188166aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5602ba54e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #30: + 0x150582 (0x5602ba569582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: sharded_logits = self.model( -[default6]:[rank38]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5602ba54e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: frame #32: + 0x150582 (0x5602ba569582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default6]:[rank38]: frame #33: _PyEval_EvalFrameDefault + 0x13ca 
(0x5602ba54e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56188165b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default2]:[rank34]: frame #30: + 0x150582 (0x561881676582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56188165b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) 
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: frame #32: + 0x150582 (0x561881676582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: frame #34: + 0x150582 (0x5602ba569582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank34]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56188165b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank34]: frame #34: + 0x150582 (0x561881676582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5602ba54e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in 
_recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank34]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56188165b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank44]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb4de0a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank44]: frame #1: + 0x5b3a23e (0x7fb517bc323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x561881662f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fb517bbdc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fb517bbdf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fb517bbefd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb517b73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb517b73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default2]:[rank34]: frame #37: _PyObject_Call_Prepend + 0x69 (0x561881674c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #36: 
_PyObject_FastCallDictTstate + 0xd0 (0x5602ba555f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5602ba567c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank38]: frame #38: + 0x211239 (0x5602ba62a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb517b73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fb517b73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default4]:[rank44]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fb4df380189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fb4df387610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fb4df3a6978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank44]: frame #12: + 0x5adc309 (0x7fb517b65309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #13: [default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -+ 0x5ae6f10 (0x7fb517b6ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank38]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5602ba556a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank44]: frame #14: + 0x5ae6fa5 (0x7fb517b6ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #15: + 0x5124446 (0x7fb5171ad446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #16: + 0x1acf4b8 (0x7fb513b584b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #17: + 0x5aee004 (0x7fb517b77004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #18: + 0x5af36b5 (0x7fb517b7c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank44]: frame #19: + 0xd2631e (0x7fb52a766[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank35]: new_kwargs[name] = recv_from_pipeline_state_buffer( -31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) 
-[default4]:[rank44]: frame #20: + 0x47def4 (0x7fb529ebdef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so)
-[default4]:[rank44]: frames #21-#63: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, ...)
-[default4]:[rank44]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[default1]:[rank41]: Traceback (most recent call last):
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank41]:     trainer.train(dataloader)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank41]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank41]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank41]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank41]:     output = model(**micro_batch)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank41]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank41]:     return forward_call(*args, **kwargs)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank41]:     sharded_logits = self.model(
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank41]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank41]:     return forward_call(*args, **kwargs)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank41]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank41]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank41]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank41]:     return forward_call(*args, **kwargs)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank41]:     new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank41]:     pipeline_state.run_communication()
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank41]:     recv_activation_tensor = recv_activation()
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank41]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank41]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank41]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta
-[default1]:[rank41]:     dist.recv(
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank41]:     return func(*args, **kwargs)
-[default1]:[rank41]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank41]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank41]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default1]:[rank41]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first):
-[default1]:[rank41]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (libc10.so)
-[default1]:[rank41]: frames #1-#4: c10d::TCPStore::doWait / doGet / get (libtorch_cpu.so)
-[default1]:[rank41]: frames #5-#8: c10d::PrefixStore::get(std::string const&) + 0x31 (libtorch_cpu.so)
-[default1]:[rank41]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (libtorch_cuda.so)
-[default1]:[rank41]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (libtorch_cuda.so)
-[default1]:[rank41]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector<at::Tensor>&, int, int) + 0x5f8 (libtorch_cuda.so)
-[default1]:[rank41]: frames #12-#20: libtorch_cpu.so / libtorch_python.so dispatch internals
-[default1]:[rank41]: frames #21-#63: CPython interpreter frames (_PyObject_MakeTpCall, _PyEval_EvalFrameDefault, _PyFunction_Vectorcall, PyObject_Call, ...)
-[default1]:[rank41]: . This may indicate a possible application crash on rank 0 or a network set up issue.
-[...interleaved output from ranks 34, 35, 38, 39, 54, 55, 58, 59 and 60 reporting the same failure (matching Python tracebacks into _recv_meta, the same DistBackendError "store->get('0:1') got error: Connection reset by peer", and matching C++ frame dumps) continues here and in the lines that follow...]
-[default7]:[rank39]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56186c1c0f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56186c1d2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: return func(*args, **kwargs) -[default6]:[rank54]: frame #1: + 0x5b3a23e (0x7f6ea306a23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56124febaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56124feab8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #45: + 0x150582 (0x56124fec6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank54]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f6ea3064c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #17: + 0x5aee004 (0x7fb4ffc39004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #18: + 0x5af36b5 (0x7fb4ffc3e6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f6ea3064f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #19: + 0xd2631e (0x7fb51282831e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch[default3]:[rank35]: frame #46: PyObject_Call + 0xbc (0x56124fec6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56124fead2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank59]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -/lib/libtorch_python.so) -[default3]:[rank35]: frame #48: + 0x150582 (0x56124fec6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #49: PyObject_Call + 0xbc (0x56124fec6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #20: + 0x47def4 (0x7fb511f7fef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank35]: frame #50: 
_PyEval_EvalFrameDefault + 0x2d83 (0x56124fead2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56124febaa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #21: + 0x1445a6 (0x563ee24a85a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563ee24a1a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #38: + 0x211239 (0x56186c295239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: Traceback (most recent call last): -[default6]:[rank54]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f6ea3065fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56124feb3007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank59]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fec3a596897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: frame #23: + 0x150866 (0x563ee24b4866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563ee249d142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ea301a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56186c1c1a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ea301a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56124fec4c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default6]:[rank54]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ea301a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563ee24a8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56186c1bd3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #1: + 0x5b3a23e (0x7fec740b323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: 
frame #26: PyObject_Call + 0xbc (0x563ee24b4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6ea301a371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #54: + 0x211239 (0x56124ff87239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: Traceback (most recent call last): -[default2]:[rank58]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank58]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff4d662a897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:[rank55]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563ee249b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56186c1c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: frame #1: + 0x5b3a23e (0x7ff51014723e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563ee24a8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f6e6a827189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #55: PyObject_Call + 0x207 (0x56124fec7067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fec740adc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563ee24998fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56186c1b8c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default2]:[rank58]: frame #2: 
c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7ff510141c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f6e6a82e610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank35]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56124fead2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f6e6a84d978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #30: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56186c1c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: trainer.train(dataloader) -[default6]:[rank54]: frame #12: + 0x5adc309 (0x7f6ea300c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #13: + 0x5ae6f10 (0x7f6ea3016f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #57: + 0x150582 (0x56124fec6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56124feab8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563ee24998fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56186c1b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7ff510141f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7ff510142fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #32: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #59: + 0x150582 (0x56124fec6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fec740adf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #14: + 0x5ae6fa5 (0x7f6ea3016fa5 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #15: + 0x5124446 (0x7f6ea2654446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank39]: frame #45: + 0x150582 (0x56186c1d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: frame #16: + 0x1acf4b8 (0x7f6e9efff4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563ee24998fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #34: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #60: PyObject_Call + 0xbc (0x56124fec6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff5100f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563ee24998fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563ee24a0f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563ee24b2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #46: PyObject_Call + 0xbc (0x56186c1d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff5100f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #38: + 0x211239 (0x563ee2575239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563ee24a1a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56124fead2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: Traceback (most recent call last): -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563ee249d3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563ee24a8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56186c1bb2b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default7]:[rank55]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563ee2498c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #17: + 0x5aee004 (0x7f6ea301e004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #62: + 0x150582 (0x56124fec6582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff5100f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563ee24a8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #48: + 0x150582 (0x56186c1d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: output = model(**micro_batch) -[default2]:[rank58]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7ff5100f7371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fec740aefd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #18: + 0x5af36b5 (0x7f6ea30236b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank35]: frame #63: PyObject_Call + 0xbc (0x56124fec6f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: frame #19: + 0xd2631e (0x7f6eb5c0d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank55]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563ee24998fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #49: PyObject_Call + 0xbc (0x56186c1d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56186c1bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: frame #45: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #46: PyObject_Call + 0xbc (0x563ee24b4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56186c1c8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank35]: . 
This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: frame #20: + 0x47def4 (0x7f6eb5364ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank39]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56186c1c1007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56186c1d2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563ee249b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #54: + 0x211239 (0x56186c295239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #55: PyObject_Call + 0x207 (0x56186c1d5067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56186c1bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fec74063371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: trainer.train(dataloader) -[default6]:[rank54]: frame #21: + 0x1445a6 (0x5592ae1215a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #57: + 0x150582 (0x56186c1d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56186c1b98fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #59: + 0x150582 (0x56186c1d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #60: PyObject_Call + 0xbc (0x56186c1d4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56186c1bb2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fec74063371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7ff4d7904189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #48: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #62: + 0x150582 (0x56186c1d4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: frame #63: PyObject_Call + 0xbc (0x56186c1d4f1c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5592ae11aa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank39]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: frame #49: PyObject_Call + 0xbc (0x563ee24b4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fec74063371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #23: + 0x150866 (0x5592ae12d866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5592ae116142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5592ae121a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #26: PyObject_Call + 0xbc (0x5592ae12df1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5592ae1142b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5592ae121a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563ee249b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563ee24a8a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default2]:[rank58]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7ff4d790b610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank58]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7ff4d792a978 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default7]:[rank55]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563ee24a1007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: frame #12: + 0x5adc309 (0x7ff5100e9309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default6]:[rank54]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5592ae1128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default7]:[rank55]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563ee24b2c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #54: + 0x211239 (0x563ee2575239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: frame #30: + 0x150582 (0x5592ae12d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: sharded_logits = self.model( -[default7]:[rank55]: frame #55: PyObject_Call + 0x207 (0x563ee24b5067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5592ae1128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563ee249b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: sharded_logits = self.model( -[default0]:[rank56]: output = model(**micro_batch) -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: frame #32: + 0x150582 (0x5592ae12d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: frame #13: + 0x5ae6f10 (0x7ff5100f3f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #57: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #58: _PyEval_EvalFrameDefault + 0x13ca 
(0x563ee24998fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #59: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5592ae1128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fec74063371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fec3b870189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #60: PyObject_Call + 0xbc (0x563ee24b4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: frame #34: + 0x150582 (0x5592ae12d582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default6]:[rank54]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5592ae1128fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: frame #14: + 0x5ae6fa5 (0x7ff5100f3fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #15: + 0x5124446 (0x7ff50f731446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank55]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563ee249b2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default2]:[rank58]: frame #16: + 0x1acf4b8 (0x7ff50c0dc4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5592ae119f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank55]: frame #62: + 0x150582 (0x563ee24b4582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5592ae12bc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fec3b877610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank59]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fec3b896978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank55]: frame #63: PyObject_Call + 0xbc (0x563ee24b4f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: output = model(**micro_batch) -[default6]:[rank54]: frame #38: + 0x211239 (0x5592ae1ee239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-
- (The remaining interleaved output in this dump repeats the traceback above for ranks 40, 45, 54, 56, 58, 59, 60 and 61, each ending with the same error:)
-
-[default5]:[rank45]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer
-[default5]:[rank45]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672
(most recent call first): -[default5]:[rank45]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8bc0a4b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank45]: frame #1: + 0x5b3a23e (0x7f8bfa56823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f8bfa562c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f8bfa562f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f8bfa563fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #17: + 0x5aee004 (0x7fe6931eb004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8bfa518371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #18: + 0x5af36b5 (0x7fe6931f06b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #32: + 0x150582 (0x55e401a4f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8bfa518371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55e401a348fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #19: + 0xd2631e (0x7fe6a5dda31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: frame #16: + 0x1acf4b8 (0x7fec700484b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #17: + 0x5aee004 (0x7fec74067004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #20: + 0x47def4 (0x7fe6a5531ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8bfa518371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank40]: frame #21: + 0x1445a6 (0x558af15e25a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #22: _PyObject_MakeTpCall + 0x26b (0x558af15dba6b in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f8bfa518371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: frame #23: + 0x150866 (0x558af15ee866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x558af15d7142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f8bc1d25189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default0]:[rank40]: frame #25: _PyFunction_Vectorcall + 0x6c (0x558af15e2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #26: PyObject_Call + 0xbc (0x558af15eef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x558af15d52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #28: _PyFunction_Vectorcall + 0x6c (0x558af15e2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x558af15d38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f8bc1d2c610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f8bc1d4b978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: frame #12: + 0x5adc309 (0x7f8bfa50a309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #30: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer 
-[default5]:[rank61]: pipeline_state.run_communication() -[default5]:[rank45]: frame #13: + 0x5ae6f10 (0x7f8bfa514f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #34: + 0x150582 (0x55e401a4f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x558af15d38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #32: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x558af15d38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: frame #34: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: frame #14: + 0x5ae6fa5 (0x7f8bfa514fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55e401a348fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55e401a3bf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x558af15d38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55e401a4dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #15: + 0x5124446 (0x7f8bf9b52446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x558af15daf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: frame #16: + 0x1acf4b8 (0x7f8bf64fd4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #38: + 0x211239 (0x55e401b10239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #17: + 0x5aee004 (0x7f8bfa51c004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #18: + 0x5af36b5 (0x7f8bfa5216b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: frame #37: _PyObject_Call_Prepend + 0x69 (0x558af15ecc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55e401a3ca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55e401a383e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55e401a43a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #38: + 0x211239 (0x558af16af239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #19: + 0xd2631e (0x7f8c0d10b31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: frame #39: _PyObject_MakeTpCall + 0x26b (0x558af15dba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #18: + 0x5af36b5 (0x7fec7406c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: frame #20: + 0x47def4 (0x7f8c0c862ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank45]: frame #21: + 0x1445a6 (0x562cdb2125a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank40]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x558af15d73e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: frame #22: _PyObject_MakeTpCall + 0x26b (0x562cdb20ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: frame #41: _PyFunction_Vectorcall + 0x6c (0x558af15e2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x558af15d2c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank58]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55e401a33c5c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: frame #43: _PyFunction_Vectorcall + 0x6c (0x558af15e2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: dist.recv( -[default3]:[rank59]: frame #19: + 0xd2631e (0x7fec86c5631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank40]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x558af15d38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #45: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55e401a43a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #23: + 0x150866 (0x562cdb21e866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x562cdb207142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #25: _PyFunction_Vectorcall + 0x6c (0x562cdb212a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #20: + 0x47def4 (0x7fec863adef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: frame #21: + 0x1445a6 (0x562d20c325a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #26: PyObject_Call + 0xbc (0x562cdb21ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55e401a348fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x562cdb2052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #45: + 0x150582 (0x55e401a4f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #46: PyObject_Call + 0xbc (0x558af15eef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x558af15d52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #48: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: return func(*args, **kwargs) -[default5]:[rank45]: frame #28: _PyFunction_Vectorcall + 0x6c (0x562cdb212a2c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: dist.recv( -[default0]:[rank40]: frame #49: PyObject_Call + 0xbc (0x558af15eef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #46: PyObject_Call + 0xbc (0x55e401a4ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank45]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x562cdb2038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: frame #30: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x558af15d52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank61]: return func(*args, **kwargs) -[default5]:[rank45]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x562cdb2038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #32: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: frame #51: _PyFunction_Vectorcall + 0x6c (0x558af15e2a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55e401a362b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #22: _PyObject_MakeTpCall + 0x26b (0x562d20c2ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #23: + 0x150866 (0x562d20c3e866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x558af15db007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #53: _PyObject_Call_Prepend + 0x69 (0x558af15ecc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #48: + 0x150582 (0x55e401a4f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x562cdb2038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: return func(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait() 
-[default3]:[rank43]: trainer.train(dataloader) -[default5]:[rank45]: frame #34: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x562cdb2038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x562cdb20af50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: frame #54: + 0x211239 (0x558af16af239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank61]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank45]: frame #37: _PyObject_Call_Prepend + 0x69 (0x562cdb21cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default5]:[rank45]: frame #38: + 0x211239 (0x562cdb2df239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x562d20c27142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #25: _PyFunction_Vectorcall + 0x6c (0x562d20c32a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #55: PyObject_Call + 0x207 (0x558af15ef067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x558af15d52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #57: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x558af15d38fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #49: PyObject_Call + 0xbc (0x55e401a4ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7efe723dd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank40]: frame #59: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #1: + 0x5b3a23e (0x7efeabefa23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #26: PyObject_Call + 0xbc (0x562d20c3ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #39: _PyObject_MakeTpCall + 0x26b (0x562cdb20ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default2]:[rank58]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55e401a362b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #60: PyObject_Call + 0xbc (0x558af15eef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x562cdb2073e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #41: _PyFunction_Vectorcall + 0x6c (0x562cdb212a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x562cdb202c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7efeabef4c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank45]: frame #43: _PyFunction_Vectorcall + 0x6c (0x562cdb212a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55e401a43a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55e401a3c007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x558af15d52b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7efeabef4f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank40]: frame #62: + 0x150582 (0x558af15ee582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4d4bac6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank45]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x562cdb2038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #1: + 0x5b3a23e (0x7f4d855e323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x562d20c252b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank40]: frame #63: PyObject_Call + 0xbc (0x558af15eef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default3]:[rank59]: frame #28: _PyFunction_Vectorcall + 0x6c (0x562d20c32a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55e401a4dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #45: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank56]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6ad1a46897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank40]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: frame #1: + 0x5b3a23e (0x7f6b0b56323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7efeabef5fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: frame #46: PyObject_Call + 0xbc (0x562cdb21ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7efeabeaa371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default2]:[rank58]: frame #54: + 0x211239 (0x55e401b10239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x562cdb2052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #48: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f6b0b55dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f4d855ddc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #49: PyObject_Call + 0xbc (0x562cdb21ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f4d855ddf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #6: c10d::PrefixStore::get(std::string 
const&) + 0x31 (0x7efeabeaa371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: frame #55: PyObject_Call + 0x207 (0x55e401a50067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f4d855defd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x562cdb2052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7efeabeaa371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #51: _PyFunction_Vectorcall + 0x6c (0x562cdb212a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7efeabeaa371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default5]:[rank45]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x562cdb20b007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f6b0b55df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #53: _PyObject_Call_Prepend + 0x69 (0x562cdb21cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #54: + 0x211239 (0x562cdb2df239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x562d20c238fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4d85593371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: sharded_logits = self.model( -[default4]:[rank60]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4d85593371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7efe736b7189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7efe736be610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f6b0b55efd1 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank45]: frame #55: PyObject_Call + 0x207 (0x562cdb21f067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x562cdb2052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #57: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x562cdb2038fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55e401a362b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #57: + 0x150582 (0x55e401a4f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #59: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #60: PyObject_Call + 0xbc (0x562cdb21ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x562cdb2052b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #62: + 0x150582 (0x562cdb21e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: frame #63: PyObject_Call + 0xbc (0x562cdb21ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #30: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank45]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default4]:[rank60]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4d85593371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55e401a348fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: frame #59: + 0x150582 (0x55e401a4f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6b0b513371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6b0b513371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default5]:[rank61]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7efe736dd978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank59]: frame #31: _PyEval_EvalFrameDefault + 0x13ca 
(0x562d20c238fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #32: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f4d85593371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank58]: frame #60: PyObject_Call + 0xbc (0x55e401a4ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank61]: frame #12: + 0x5adc309 (0x7efeabe9c309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #13: + 0x5ae6f10 (0x7efeabea6f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank59]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x562d20c238fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #34: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank43]: dist.recv( -[default4]:[rank60]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f4d4cda0189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6b0b513371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f6b0b513371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank58]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55e401a362b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #62: + 0x150582 (0x55e401a4f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank58]: frame #63: PyObject_Call + 0xbc (0x55e401a4ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank43]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank60]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f4d4cda7610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f6ad2d20189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f6ad2d27610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f58a8414897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank58]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default3]:[rank43]: frame #1: + 0x5b3a23e (0x7f58e1f3123e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f58e1f2bc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #14: + 0x5ae6fa5 (0x7efeabea6fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f58e1f2bf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f58e1f2cfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #15: + 0x5124446 (0x7efeab4e4446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f58e1ee1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f58e1ee1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f58e1ee1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #16: + 0x1acf4b8 (0x7efea7e8f4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f58e1ee1371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f58a96ee189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f58a96f5610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: frame #17: + 0x5aee004 (0x7efeabeae004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #18: + 0x5af36b5 (0x7efeabeb36b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f58a9714978 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank43]: frame #12: + 0x5adc309 (0x7f58e1ed3309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #13: + 0x5ae6f10 (0x7f58e1eddf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f6ad2d46978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: trainer.train(dataloader) -[default3]:[rank43]: frame #14: + 0x5ae6fa5 (0x7f58e1eddfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank59]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x562d20c238fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x562d20c2af50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: Traceback (most recent call last): -[default3]:[rank43]: frame #15: + 0x5124446 (0x7f58e151b446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #16: + 0x1acf4b8 (0x7f58ddec64b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #12: + 0x5adc309 (0x7f6b0b505309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f4d4cdc6978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default3]:[rank43]: frame #17: + 0x5aee004 (0x7f58e1ee5004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #18: + 0x5af36b5 (0x7f58e1eea6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #19: + 0xd2631e (0x7efebea9d31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank61]: frame #20: + 0x47def4 (0x7efebe1f4ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #19: + 0xd2631e (0x7f58f4ad431e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank60]: frame #12: + 0x5adc309 (0x7f4d85585309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #20: + 0x47def4 (0x7f58f422bef4 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank43]: frame #21: + 0x1445a6 (0x55dd6ea805a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #13: + 0x5ae6f10 (0x7f4d8558ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55dd6ea79a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #23: + 0x150866 (0x55dd6ea8c866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55dd6ea75142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #13: + 0x5ae6f10 (0x7f6b0b50ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55dd6ea80a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #26: PyObject_Call + 0xbc (0x55dd6ea8cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55dd6ea732b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #37: _PyObject_Call_Prepend + 0x69 (0x562d20c3cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #38: + 0x211239 (0x562d20cff239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55dd6ea80a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55dd6ea718fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #14: + 0x5ae6fa5 (0x7f6b0b50ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #15: + 0x5124446 (0x7f6b0ab4d446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: Traceback (most recent call last): -[default3]:[rank43]: frame #30: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #14: + 0x5ae6fa5 (0x7f4d8558ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55dd6ea718fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #32: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #15: + 0x5124446 (0x7f4d84bcd446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in 
train -[default3]:[rank43]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55dd6ea718fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #39: _PyObject_MakeTpCall + 0x26b (0x562d20c2ba6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default3]:[rank43]: frame #34: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #16: + 0x1acf4b8 (0x7f6b074f84b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55dd6ea718fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: Traceback (most recent call last): -[default4]:[rank60]: frame #16: + 0x1acf4b8 (0x7f4d815784b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #17: + 0x5aee004 (0x7f4d85597004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank43]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55dd6ea78f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: frame #18: + 0x5af36b5 (0x7f4d8559c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #17: + 0x5aee004 (0x7f6b0b517004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #21: + 0x1445a6 (0x563b45c795a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563b45c72a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: trainer.train(dataloader) -[default7]:[rank47]: Traceback (most recent call last): -[default0]:[rank56]: frame #18: + 0x5af36b5 (0x7f6b0b51c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55dd6ea8ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: trainer.train(dataloader) -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: frame #38: + 0x211239 (0x55dd6eb4d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #39: 
_PyObject_MakeTpCall + 0x26b (0x55dd6ea79a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55dd6ea753e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55dd6ea80a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55dd6ea70c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55dd6ea80a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #19: + 0xd2631e (0x7f6b1e10631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank59]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x562d20c273e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #19: + 0xd2631e (0x7f4d9818631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55dd6ea718fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #41: _PyFunction_Vectorcall + 0x6c (0x562d20c32a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #20: + 0x47def4 (0x7f6b1d85def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank61]: frame #23: + 0x150866 (0x563b45c85866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: frame #20: + 0x47def4 (0x7f4d978ddef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: frame #45: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #46: PyObject_Call + 0xbc (0x55dd6ea8cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #21: + 
0x1445a6 (0x56450b81a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #21: + 0x1445a6 (0x55b1c13e95a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563b45c6e142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #22: _PyObject_MakeTpCall + 0x26b (0x56450b813a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x562d20c22c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55b1c13e2a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #23: + 0x150866 (0x55b1c13f5866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: frame #43: _PyFunction_Vectorcall + 0x6c (0x562d20c32a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563b45c79a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: output = model(**micro_batch) -[default5]:[rank61]: frame #26: PyObject_Call + 0xbc (0x563b45c85f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55b1c13de142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55dd6ea732b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55b1c13e9a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x562d20c238fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: frame #45: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #23: + 0x150866 (0x56450b826866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56450b80f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: output = model(**micro_batch) -[default3]:[rank59]: frame #46: PyObject_Call + 0xbc (0x562d20c3ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: frame #48: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563b45c6c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #26: PyObject_Call + 0xbc (0x55b1c13f5f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x562d20c252b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default5]:[rank61]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563b45c79a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #49: PyObject_Call + 0xbc (0x55dd6ea8cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563b45c6a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #48: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56450b81aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #26: PyObject_Call + 0xbc (0x56450b826f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: frame #30: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563b45c6a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55dd6ea732b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #32: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56450b80d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #49: PyObject_Call + 0xbc (0x562d20c3ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x562d20c252b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: sharded_logits = self.model( -[default3]:[rank59]: frame #51: _PyFunction_Vectorcall + 0x6c (0x562d20c32a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55dd6ea80a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55dd6ea79007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55b1c13dc2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55b1c13e9a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55dd6ea8ac39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56450b81aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563b45c6a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55b1c13da8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: frame #54: + 0x211239 (0x55dd6eb4d239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x562d20c2b007 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #53: _PyObject_Call_Prepend + 0x69 (0x562d20c3cc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: frame #54: + 0x211239 (0x562d20cff239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: frame #55: PyObject_Call + 0x207 (0x55dd6ea8d067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55dd6ea732b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #57: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56450b80b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default3]:[rank59]: frame #55: PyObject_Call + 0x207 (0x562d20c3f067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: sharded_logits = self.model( -[default4]:[rank60]: frame #30: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #30: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x562d20c252b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55dd6ea718fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #57: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward 
-[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: output = model(**micro_batch) -[default3]:[rank43]: frame #59: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: frame #60: PyObject_Call + 0xbc (0x55dd6ea8cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55dd6ea732b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56450b80b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x562d20c238fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default0]:[rank56]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55b1c13da8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #32: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: frame #59: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #60: PyObject_Call + 0xbc (0x562d20c3ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: frame #62: + 0x150582 (0x55dd6ea8c582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank43]: frame #63: PyObject_Call + 0xbc (0x55dd6ea8cf1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #34: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x562d20c252b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563b45c6a8fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563b45c71f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563b45c83c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: frame #62: + 0x150582 (0x562d20c3e582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank59]: frame #63: PyObject_Call + 0xbc (0x562d20c3ef1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default4]:[rank60]: frame #32: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank61]: frame #38: + 0x211239 (0x563b45d46239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563b45c72a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55b1c13da8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56450b80b8fa in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank59]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank61]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563b45c6e3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: pipeline_state.run_communication() -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: frame #34: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56450b80b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: output = model(**micro_batch) -[default2]:[rank42]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: frame #34: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55b1c13da8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55b1c13e1f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563b45c79a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank61]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563b45c69c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563b45c79a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55b1c13f3c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563b45c6a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: 
return forward_call(*args, **kwargs) -[default0]:[rank56]: frame #38: + 0x211239 (0x55b1c14b6239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank60]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x56450b812f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #37: _PyObject_Call_Prepend + 0x69 (0x56450b824c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55b1c13e2a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55b1c13de3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: frame #38: + 0x211239 (0x56450b8e7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55b1c13e9a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: frame #39: _PyObject_MakeTpCall + 0x26b (0x56450b813a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56450b80f3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: sharded_logits = self.model( -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank42]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank61]: frame #45: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55b1c13d9c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55b1c13e9a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: pipeline_state.run_communication() -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: frame #41: _PyFunction_Vectorcall + 
0x6c (0x56450b81aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56450b80ac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return func(*args, **kwargs) -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: 
sharded_logits = self.model( -[default2]:[rank42]: dist.recv( -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank42]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank42]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55b1c13da8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default4]:[rank60]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56450b81aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56450b80b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank56]: frame #45: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #46: PyObject_Call + 0xbc (0x563b45c85f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default7]:[rank47]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank56]: frame #46: PyObject_Call + 0xbc (0x55b1c13f5f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: 
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcb18a17897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank56]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55b1c13dc2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c275e8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank60]: frame #45: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #46: PyObject_Call + 0xbc (0x56450b826f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #1: + 0x5b3a23e (0x7fcb5253423e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #48: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default7]:[rank47]: frame #1: + 0x5b3a23e (0x7f1c6110523e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f1c610ffc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #49: PyObject_Call + 0xbc (0x55b1c13f5f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563b45c6c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56450b80d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #48: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f1c610fff82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55b1c13dc2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #48: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #2: 
c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7fcb5252ec87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #49: PyObject_Call + 0xbc (0x56450b826f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f1c61100fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #49: PyObject_Call + 0xbc (0x563b45c85f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7fcb5252ef82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7fcb5252ffd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55b1c13e9a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56450b80d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56450b81aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1c610b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563b45c6c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb524e4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563b45c79a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55b1c13e2007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1c610b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #7: 
c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1c610b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x56450b813007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f1c610b5371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f1c288c2189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank56]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55b1c13f3c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f1c288c9610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #53: _PyObject_Call_Prepend + 0x69 (0x56450b824c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #54: + 0x211239 (0x56450b8e7239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return func(*args, **kwargs) -[default7]:[rank47]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f1c288e8978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #12: + 0x5adc309 (0x7f1c610a7309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #54: + 0x211239 (0x55b1c14b6239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default7]:[rank47]: frame #13: + 0x5ae6f10 (0x7f1c610b1f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #55: PyObject_Call + 0x207 (0x55b1c13f6067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb524e4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563b45c72007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: 
File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb524e4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7fcb524e4371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #55: PyObject_Call + 0x207 (0x56450b827067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fcb19cf1189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fcb19cf8610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank60]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56450b80d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fcb19d17978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank61]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563b45c83c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #54: + 0x211239 (0x563b45d46239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #14: + 0x5ae6fa5 (0x7f1c610b1fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #57: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #12: + 0x5adc309 (0x7fcb524d6309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56450b80b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #15: + 0x5124446 (0x7f1c606ef446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55b1c13dc2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #13: + 0x5ae6f10 (0x7fcb524e0f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #57: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #55: PyObject_Call + 0x207 (0x563b45c86067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: 
File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: frame #16: + 0x1acf4b8 (0x7f1c5d09a4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55b1c13da8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: frame #17: + 0x5aee004 (0x7f1c610b9004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank56]: frame #59: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #59: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: frame #14: + 0x5ae6fa5 (0x7fcb524e0fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #60: PyObject_Call + 0xbc (0x56450b826f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: frame #18: + 0x5af36b5 (0x7f1c610be6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank60]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56450b80d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: frame #19: + 0xd2631e (0x7f1c73ca831e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #15: + 0x5124446 (0x7fcb51b1e446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank61]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563b45c6c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: frame #20: + 0x47def4 (0x7f1c733ffef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #21: + 0x1445a6 (0x5602721ec5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #60: PyObject_Call + 0xbc (0x55b1c13f5f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: frame #16: + 0x1acf4b8 (0x7fcb4e4c94b8 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5602721e5a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #17: + 0x5aee004 (0x7fcb524e8004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #18: + 0x5af36b5 (0x7fcb524ed6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #23: + 0x150866 (0x5602721f8866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank49]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank42]: frame #19: + 0xd2631e (0x7fcb650d731e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #20: + 0x47def4 (0x7fcb6482eef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5602721e1142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #62: + 0x150582 (0x56450b826582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: frame #63: PyObject_Call + 0xbc (0x56450b826f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #21: + 0x1445a6 (0x56045368a5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #57: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563b45c6a8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5602721eca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank60]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default7]:[rank47]: frame #26: PyObject_Call + 0xbc (0x5602721f8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #59: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #22: _PyObject_MakeTpCall + 0x26b (0x560453683a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #60: PyObject_Call + 0xbc (0x563b45c85f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55b1c13dc2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5602721df2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563b45c6c2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #23: + 0x150866 (0x560453696866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #62: + 0x150582 (0x55b1c13f5582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank52]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5602721eca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #62: + 0x150582 (0x563b45c85582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: trainer.train(dataloader) -[default2]:[rank42]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x56045367f142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: frame #63: PyObject_Call + 0xbc (0x563b45c85f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: frame #63: PyObject_Call + 0xbc (0x55b1c13f5f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default2]:[rank42]: frame #25: _PyFunction_Vectorcall + 0x6c (0x56045368aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #26: PyObject_Call + 0xbc (0x560453696f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank56]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
-[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f14c75fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank42]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x56045367d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #28: _PyFunction_Vectorcall + 0x6c (0x56045368aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank61]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank49]: frame #1: + 0x5b3a23e (0x7f150111823e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: pipeline_state.run_communication() -[default2]:[rank42]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x56045367b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank48]: pipeline_state.run_communication() -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5602721dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank42]: frame #30: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default2]:[rank50]: pipeline_state.run_communication() -[default7]:[rank47]: frame #30: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default2]:[rank42]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x56045367b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f1501112c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f1501112f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5602721dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: pipeline_state.run_communication() -[default2]:[rank42]: frame #32: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: 
outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: recv_activation_tensor = recv_activation() -[default2]:[rank50]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: frame #32: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x56045367b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5602721dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #34: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #34: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x56045367b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5602721dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x560453682f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5602721e4f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f1501113fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f15010c8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #37: _PyObject_Call_Prepend + 0x69 (0x560453694c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5602721f6c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f15010c8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #38: + 0x211239 (0x560453757239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f15010c8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #38: + 0x211239 (0x5602722b9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] 
-[default2]:[rank42]: frame #39: _PyObject_MakeTpCall + 0x26b (0x560453683a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5602721e5a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: recv_activation_tensor = recv_activation() -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x56045367f3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5602721e13e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f15010c8371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f14c88d5189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank42]: frame #41: _PyFunction_Vectorcall + 0x6c (0x56045368aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x56045367ac5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: frame #43: _PyFunction_Vectorcall + 0x6c (0x56045368aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5602721eca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default2]:[rank42]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x56045367b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #45: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5602721dcc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #46: PyObject_Call + 0xbc (0x560453696f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5602721eca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x56045367d2b3 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5602721dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #48: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #49: PyObject_Call + 0xbc (0x560453696f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x56045367d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #51: _PyFunction_Vectorcall + 0x6c (0x56045368aa2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x560453683007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: frame #45: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: frame #46: PyObject_Call + 0xbc (0x5602721f8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5602721df2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f14c88dc610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f14c88fb978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:[rank47]: frame #48: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: frame #12: + 0x5adc309 (0x7f15010ba309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #13: + 0x5ae6f10 (0x7f15010c4f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #49: PyObject_Call + 0xbc (0x5602721f8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 
(0x5602721df2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5602721eca2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: frame #53: _PyObject_Call_Prepend + 0x69 (0x560453694c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #14: + 0x5ae6fa5 (0x7f15010c4fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default7]:[rank47]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5602721e5007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #15: + 0x5124446 (0x7f1500702446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank42]: frame #54: + 0x211239 (0x560453757239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #55: PyObject_Call + 0x207 (0x560453697067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank42]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x56045367d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default2]:[rank42]: frame #57: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x56045367b8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5602721f6c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: frame #54: + 0x211239 (0x5602722b9239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #59: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #55: PyObject_Call + 0x207 (0x5602721f9067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: 
frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5602721df2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #57: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #60: PyObject_Call + 0xbc (0x560453696f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5602721dd8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default7]:[rank47]: frame #59: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #16: + 0x1acf4b8 (0x7f14fd0ad4b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #17: + 0x5aee004 (0x7f15010cc004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default0]:[rank48]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank42]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x56045367d2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #18: + 0x5af36b5 (0x7f15010d16b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #19: + 0xd2631e (0x7f1513cbb31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default7]:[rank47]: frame #60: PyObject_Call + 0xbc (0x5602721f8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5602721df2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank42]: frame #62: + 0x150582 (0x560453696582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: frame #62: + 0x150582 (0x5602721f8582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #20: + 0x47def4 (0x7f1513412ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank42]: frame #63: 
PyObject_Call + 0xbc (0x560453696f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default0]:[rank48]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: frame #63: PyObject_Call + 0xbc (0x5602721f8f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: dist.recv( -[default2]:[rank42]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: dist.recv( -[default7]:[rank47]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default2]:[rank50]: dist.recv( -[default5]:[rank53]: return func(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: return func(*args, **kwargs) -[default1]:[rank49]: frame #21: + 0x1445a6 (0x55848a0ed5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: dist.recv( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank52]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank50]: return func(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank50]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default5]:[rank53]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default2]:[rank50]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default0]:[rank48]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default1]:[rank49]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55848a0e6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #23: + 0x150866 (0x55848a0f9866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55848a0e2142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default0]:[rank48]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default4]:[rank52]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank50]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fefb9ae6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:[rank50]: frame #1: + 0x5b3a23e (0x7feff360323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f269b2a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:[rank48]: frame #1: + 0x5b3a23e (0x7f26d4dc323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55848a0eda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7feff35fdc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f044cc63897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank52]: frame #1: + 0x5b3a23e (0x7f048678023e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f81599f6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:[rank53]: frame #1: + 0x5b3a23e (0x7f819351323e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f26d4dbdc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default5]:[rank53]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f819350dc87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f048677ac87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f26d4dbdf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank53]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f819350df82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7feff35fdf82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7feff35fefd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f26d4dbefd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f26d4d73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f26d4d73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feff35b3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feff35b3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f048677af82 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #26: PyObject_Call + 0xbc (0x55848a0f9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feff35b3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7feff35b3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f048677bfd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f819350efd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default2]:[rank50]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7fefbadc0189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f26d4d73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7fefbadc7610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:[rank50]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7fefbade6978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0486730371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0486730371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0486730371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55848a0e02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f81934c3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f81934c3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() 
-[default5]:[rank53]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f81934c3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f0486730371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f044df3d189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:[rank52]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f044df44610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank48]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f26d4d73371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f269c580189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f81934c3371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #12: + 0x5adc309 (0x7feff35a5309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #13: + 0x5ae6f10 (0x7feff35aff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank53]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f815acd0189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f815acd7610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank49]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55848a0eda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f044df63978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #11: 
c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f815acf6978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank52]: frame #12: + 0x5adc309 (0x7f0486722309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #13: + 0x5ae6f10 (0x7f048672cf10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #14: + 0x5ae6fa5 (0x7feff35affa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #15: + 0x5124446 (0x7feff2bed446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #16: + 0x1acf4b8 (0x7fefef5984b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f269c587610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f269c5a6978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank53]: frame #12: + 0x5adc309 (0x7f81934b5309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 246, in _recv_meta -[default5]:[rank53]: frame #13: + 0x5ae6f10 (0x7f81934bff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #17: + 0x5aee004 (0x7feff35b7004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #14: + 0x5ae6fa5 (0x7f048672cfa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #12: + 0x5adc309 (0x7f26d4d65309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #18: + 0x5af36b5 (0x7feff35bc6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #14: + 0x5ae6fa5 (0x7f81934bffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #15: + 0x5124446 (0x7f0485d6a446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55848a0de8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #30: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #13: + 0x5ae6f10 (0x7f26d4d6ff10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55848a0de8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #19: + 0xd2631e (0x7ff0061a631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #20: + 0x47def4 (0x7ff0058fdef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #21: + 0x1445a6 (0x563a64d465a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #22: _PyObject_MakeTpCall + 0x26b (0x563a64d3fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #23: + 0x150866 (0x563a64d52866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #16: + 0x1acf4b8 (0x7f04827154b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #17: + 0x5aee004 (0x7f0486734004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #18: + 0x5af36b5 (0x7f04867396b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: return func(*args, **kwargs) -[default1]:[rank49]: frame #32: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55848a0de8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #19: + 0xd2631e (0x7f049932331e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #14: + 0x5ae6fa5 (0x7f26d4d6ffa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #15: + 0x5124446 (0x7f26d43ad446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank52]: frame #20: + 0x47def4 (0x7f0498a7aef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x563a64d3b142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #25: _PyFunction_Vectorcall + 0x6c (0x563a64d46a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame 
#26: PyObject_Call + 0xbc (0x563a64d52f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #16: + 0x1acf4b8 (0x7f26d0d584b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #17: + 0x5aee004 (0x7f26d4d77004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #21: + 0x1445a6 (0x55fa0b1835a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank53]: frame #15: + 0x5124446 (0x7f8192afd446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #18: + 0x5af36b5 (0x7f26d4d7c6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x563a64d392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #34: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55848a0de8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #19: + 0xd2631e (0x7f26e796631e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #20: + 0x47def4 (0x7f26e70bdef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default4]:[rank52]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55fa0b17ca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #28: _PyFunction_Vectorcall + 0x6c (0x563a64d46a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #23: + 0x150866 (0x55fa0b18f866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55fa0b178142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55fa0b183a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #16: + 0x1acf4b8 (0x7f818f4a84b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #21: + 0x1445a6 (0x555dd8a7d5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55848a0e5f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55848a0f7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x563a64d378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #22: _PyObject_MakeTpCall + 0x26b (0x555dd8a76a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: torch.distributed.DistBackendError: [1] is setting up NCCL communicator and retrieving ncclUniqueId 
from [0] via c10d key-value store by key '0:1', but store->get('0:1') got error: Connection reset by peer -[default1]:[rank49]: frame #38: + 0x211239 (0x55848a1ba239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #30: + 0x150582 (0x563a64d52582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x563a64d378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #23: + 0x150866 (0x555dd8a89866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #26: PyObject_Call + 0xbc (0x55fa0b18ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #32: + 0x150582 (0x563a64d52582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55848a0e6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: Exception raised from recvBytes at ../torch/csrc/distributed/c10d/Utils.hpp:672 (most recent call first): -[default2]:[rank50]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x563a64d378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #17: + 0x5aee004 (0x7f81934c7004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #18: + 0x5af36b5 (0x7f81934cc6b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa0b1762b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #34: + 0x150582 (0x563a64d52582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x555dd8a72142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55848a0e23e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55848a0eda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x563a64d378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f022341f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:[rank52]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55fa0b183a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55fa0b1748fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #25: _PyFunction_Vectorcall + 0x6c (0x555dd8a7da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #1: + 0x5b3a23e (0x7f025cf3c23e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #19: + 0xd2631e (0x7f81a60b631e in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default5]:[rank53]: frame #20: + 0x47def4 (0x7f81a580def4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default0]:[rank48]: frame #26: PyObject_Call + 0xbc (0x555dd8a89f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #30: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #21: + 0x1445a6 (0x5607aa1775a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #22: _PyObject_MakeTpCall + 0x26b (0x5607aa170a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #2: c10d::TCPStore::doWait(c10::ArrayRef, std::chrono::duration >) + 0x2c7 (0x7f025cf36c87 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x563a64d3ef50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x555dd8a702b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #28: _PyFunction_Vectorcall + 0x6c (0x555dd8a7da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #23: + 0x150866 (0x5607aa183866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x55fa0b1748fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #32: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55848a0ddc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x5607aa16c142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #25: _PyFunction_Vectorcall + 0x6c (0x5607aa177a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #37: _PyObject_Call_Prepend + 0x69 (0x563a64d50c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #38: + 0x211239 (0x563a64e13239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #39: _PyObject_MakeTpCall + 0x26b (0x563a64d3fa6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #3: c10d::TCPStore::doGet(std::string const&) + 0x32 (0x7f025cf36f82 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #4: c10d::TCPStore::get(std::string const&) + 0xa1 (0x7f025cf37fd1 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55848a0eda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55848a0de8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) 
-[default0]:[rank48]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x555dd8a6e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #30: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55fa0b1748fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #5: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f025ceec371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #6: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f025ceec371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x555dd8a6e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #32: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x555dd8a6e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #34: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x563a64d3b3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #45: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #34: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x555dd8a6e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x555dd8a75f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #41: _PyFunction_Vectorcall + 0x6c (0x563a64d46a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #7: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f025ceec371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #8: c10d::PrefixStore::get(std::string const&) + 0x31 (0x7f025ceec371 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default1]:[rank49]: frame #46: PyObject_Call + 0xbc (0x55848a0f9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55848a0e02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #37: _PyObject_Call_Prepend + 0x69 (0x555dd8a87c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x563a64d36c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55fa0b1748fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #36: _PyObject_FastCallDictTstate + 0xd0 
(0x55fa0b17bf50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #43: _PyFunction_Vectorcall + 0x6c (0x563a64d46a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x563a64d378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #48: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #9: c10d::ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId*, bool, std::string const&, int) + 0xa9 (0x7f02246f9189 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: frame #10: c10d::ProcessGroupNCCL::getNCCLComm(std::string const&, c10::Device&, c10d::OpType, int, bool) + 0xc50 (0x7f0224700610 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:[rank51]: frame #11: c10d::ProcessGroupNCCL::recv(std::vector >&, int, int) + 0x5f8 (0x7f022471f978 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:[rank48]: frame #38: + 0x211239 (0x555dd8b4a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #39: _PyObject_MakeTpCall + 0x26b (0x555dd8a76a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x555dd8a723e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #49: PyObject_Call + 0xbc (0x55848a0f9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #12: + 0x5adc309 (0x7f025cede309 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #26: PyObject_Call + 0xbc (0x5607aa183f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #45: + 0x150582 (0x563a64d52582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #46: PyObject_Call + 0xbc (0x563a64d52f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #13: + 0x5ae6f10 (0x7f025cee8f10 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55fa0b18dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #38: + 0x211239 (0x55fa0b250239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55fa0b17ca6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55fa0b1783e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55fa0b183a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x563a64d392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #48: + 0x150582 (0x563a64d52582 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #41: _PyFunction_Vectorcall + 0x6c (0x555dd8a7da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x555dd8a6dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #43: _PyFunction_Vectorcall + 0x6c (0x555dd8a7da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x555dd8a6e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55fa0b173c5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #14: + 0x5ae6fa5 (0x7f025cee8fa5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default5]:[rank53]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x5607aa16a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #49: PyObject_Call + 0xbc (0x563a64d52f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x563a64d392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #51: _PyFunction_Vectorcall + 0x6c (0x563a64d46a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #45: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #28: _PyFunction_Vectorcall + 0x6c (0x5607aa177a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x5607aa1688fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #30: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x563a64d3f007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55848a0e02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55fa0b183a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55fa0b1748fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #53: _PyObject_Call_Prepend + 0x69 (0x563a64d50c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #54: + 0x211239 (0x563a64e13239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #15: + 0x5124446 (0x7f025c526446 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default0]:[rank48]: frame #46: PyObject_Call + 0xbc (0x555dd8a89f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55848a0eda2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame 
#52: _PyObject_FastCallDictTstate + 0x187 (0x55848a0e6007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55848a0f7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #16: + 0x1acf4b8 (0x7f0258ed14b8 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default4]:[rank52]: frame #45: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #46: PyObject_Call + 0xbc (0x55fa0b18ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #17: + 0x5aee004 (0x7f025cef0004 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default3]:[rank51]: frame #18: + 0x5af36b5 (0x7f025cef56b5 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so) -[default2]:[rank50]: frame #55: PyObject_Call + 0x207 (0x563a64d53067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x563a64d392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x555dd8a702b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #48: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #57: + 0x150582 (0x563a64d52582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x563a64d378fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #54: + 0x211239 (0x55848a1ba239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #59: + 0x150582 (0x563a64d52582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa0b1762b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #48: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #19: + 0xd2631e (0x7f026fadf31e in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default2]:[rank50]: frame #60: PyObject_Call + 0xbc (0x563a64d52f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x563a64d392b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #62: + 0x150582 (0x563a64d52582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #31: _PyEval_EvalFrameDefault + 0x13ca (0x5607aa1688fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #32: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #49: PyObject_Call + 0xbc (0x55fa0b18ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #49: PyObject_Call + 
0xbc (0x555dd8a89f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x555dd8a702b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #20: + 0x47def4 (0x7f026f236ef4 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_python.so) -[default3]:[rank51]: frame #21: + 0x1445a6 (0x55d7e1aad5a6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #55: PyObject_Call + 0x207 (0x55848a0fa067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa0b1762b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55fa0b183a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #51: _PyFunction_Vectorcall + 0x6c (0x555dd8a7da2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: frame #63: PyObject_Call + 0xbc (0x563a64d52f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default2]:[rank50]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default1]:[rank49]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55848a0e02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #57: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55848a0de8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #22: _PyObject_MakeTpCall + 0x26b (0x55d7e1aa6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #23: + 0x150866 (0x55d7e1ab9866 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x555dd8a76007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #53: _PyObject_Call_Prepend + 0x69 (0x555dd8a87c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55fa0b17c007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55fa0b18dc39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #24: _PyEval_EvalFrameDefault + 0x4c12 (0x55d7e1aa2142 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #59: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #25: _PyFunction_Vectorcall + 0x6c (0x55d7e1aada2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x5607aa1688fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #34: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #54: + 0x211239 (0x55fa0b250239 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #55: PyObject_Call + 0x207 (0x55fa0b190067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #54: + 0x211239 (0x555dd8b4a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #55: PyObject_Call + 0x207 (0x555dd8a8a067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #26: PyObject_Call + 0xbc (0x55d7e1ab9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #60: PyObject_Call + 0xbc (0x55848a0f9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55848a0e02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #62: + 0x150582 (0x55848a0f9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: frame #63: PyObject_Call + 0xbc (0x55848a0f9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x555dd8a702b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default1]:[rank49]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default4]:[rank52]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa0b1762b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #57: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55fa0b1748fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #59: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #60: PyObject_Call + 0xbc (0x55fa0b18ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #27: _PyEval_EvalFrameDefault + 0x2d83 (0x55d7e1aa02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #28: _PyFunction_Vectorcall + 0x6c (0x55d7e1aada2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x5607aa1688fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x5607aa16ff50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #29: _PyEval_EvalFrameDefault + 0x13ca (0x55d7e1a9e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #30: + 0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #37: _PyObject_Call_Prepend + 0x69 (0x5607aa181c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55fa0b1762b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #62: + 0x150582 (0x55fa0b18f582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #31: 
_PyEval_EvalFrameDefault + 0x13ca (0x55d7e1a9e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #32: + 0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #57: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x555dd8a6e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #33: _PyEval_EvalFrameDefault + 0x13ca (0x55d7e1a9e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #34: + 0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #59: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #60: PyObject_Call + 0xbc (0x555dd8a89f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #38: + 0x211239 (0x5607aa244239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: frame #63: PyObject_Call + 0xbc (0x55fa0b18ff1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default4]:[rank52]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default3]:[rank51]: frame #35: _PyEval_EvalFrameDefault + 0x13ca (0x55d7e1a9e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #36: _PyObject_FastCallDictTstate + 0xd0 (0x55d7e1aa5f50 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #39: _PyObject_MakeTpCall + 0x26b (0x5607aa170a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x5607aa16c3e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #37: _PyObject_Call_Prepend + 0x69 (0x55d7e1ab7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x555dd8a702b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #62: + 0x150582 (0x555dd8a89582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #38: + 0x211239 (0x55d7e1b7a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #39: _PyObject_MakeTpCall + 0x26b (0x55d7e1aa6a6b in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #41: _PyFunction_Vectorcall + 0x6c (0x5607aa177a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #40: _PyEval_EvalFrameDefault + 0x4eb6 (0x55d7e1aa23e6 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #41: _PyFunction_Vectorcall + 0x6c (0x55d7e1aada2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x55d7e1a9dc5c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #42: _PyEval_EvalFrameDefault + 0x72c (0x5607aa167c5c in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: frame #63: PyObject_Call + 0xbc (0x555dd8a89f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #43: _PyFunction_Vectorcall + 0x6c (0x55d7e1aada2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default0]:[rank48]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank53]: frame #43: _PyFunction_Vectorcall + 0x6c (0x5607aa177a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x5607aa1688fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #44: _PyEval_EvalFrameDefault + 0x13ca (0x55d7e1a9e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #45: + 0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #45: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #46: PyObject_Call + 0xbc (0x5607aa183f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x5607aa16a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #48: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #49: PyObject_Call + 0xbc (0x5607aa183f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x5607aa16a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #46: PyObject_Call + 0xbc (0x55d7e1ab9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #47: _PyEval_EvalFrameDefault + 0x2d83 (0x55d7e1aa02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #48: + 0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #49: PyObject_Call + 0xbc (0x55d7e1ab9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #50: _PyEval_EvalFrameDefault + 0x2d83 (0x55d7e1aa02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #51: _PyFunction_Vectorcall + 0x6c (0x55d7e1aada2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x55d7e1aa6007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #53: _PyObject_Call_Prepend + 0x69 (0x55d7e1ab7c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #54: + 0x211239 (0x55d7e1b7a239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #55: PyObject_Call + 0x207 (0x55d7e1aba067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x55d7e1aa02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #57: + 
0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x55d7e1a9e8fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #59: + 0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #60: PyObject_Call + 0xbc (0x55d7e1ab9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x55d7e1aa02b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #62: + 0x150582 (0x55d7e1ab9582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: frame #63: PyObject_Call + 0xbc (0x55d7e1ab9f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default3]:[rank51]: . This may indicate a possible application crash on rank 0 or a network set up issue. -[default5]:[rank53]: frame #51: _PyFunction_Vectorcall + 0x6c (0x5607aa177a2c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #52: _PyObject_FastCallDictTstate + 0x187 (0x5607aa170007 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #53: _PyObject_Call_Prepend + 0x69 (0x5607aa181c39 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #54: + 0x211239 (0x5607aa244239 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #55: PyObject_Call + 0x207 (0x5607aa184067 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #56: _PyEval_EvalFrameDefault + 0x2d83 (0x5607aa16a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #57: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #58: _PyEval_EvalFrameDefault + 0x13ca (0x5607aa1688fa in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #59: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #60: PyObject_Call + 0xbc (0x5607aa183f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #61: _PyEval_EvalFrameDefault + 0x2d83 (0x5607aa16a2b3 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #62: + 0x150582 (0x5607aa183582 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: frame #63: PyObject_Call + 0xbc (0x5607aa183f1c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10) -[default5]:[rank53]: . This may indicate a possible application crash on rank 0 or a network set up issue. 
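The interleaved stack traces above all bottom out in the same chain, c10d::ProcessGroupNCCL::recv -> getNCCLComm -> broadcastUniqueNCCLID: each pipeline-parallel receiver blocks on the c10d key-value store waiting for the ncclUniqueId that the sender rank was supposed to publish, and the lookup dies with "Connection reset by peer" because the process backing the store is already gone. A minimal sketch of that same blocking lookup, assuming a hypothetical store host and port (only the '0:1' key mirrors the error quoted above):

# Hedged sketch, not the nanotron p2p code path itself: the same kind of
# blocking TCPStore.get() that getNCCLComm performs when it retrieves the
# ncclUniqueId published by the peer rank. If the process hosting the store
# has crashed, this call surfaces as "Connection reset by peer".
import datetime
from torch.distributed import TCPStore

store = TCPStore(
    host_name="master-node",   # placeholder host, not taken from this run
    port=29500,                # placeholder port
    world_size=2,
    is_master=False,
    timeout=datetime.timedelta(seconds=30),
)
unique_id = store.get("0:1")   # same key pattern as in the DistBackendError above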
-W0703 00:25:51.595000 139640441337664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1768197 closing signal SIGTERM -E0703 00:25:51.919000 139640441337664 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1768194) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:25:51 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1768195) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:25:51 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1768196) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:25:51 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1768198) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:25:51 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1768199) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:25:51 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1768200) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_00:25:51 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1768201) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:25:51 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1768194) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -W0703 
00:25:55.688000 140127802308352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3883965_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.411000 139726817724160 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1410911_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.437000 140167677568768 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3755077_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.492000 140616618817280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1132214_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.500000 140605930690304 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-162-233.ec2.internal_1390178_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.561000 139853893625600 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_872968_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.566000 139632555181824 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_859536_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 00:25:56.606000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884035 closing signal SIGTERM -W0703 00:25:56.606000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884036 closing signal SIGTERM -W0703 00:25:56.606000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884037 closing signal SIGTERM -W0703 00:25:56.607000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884038 closing signal SIGTERM -W0703 00:25:56.608000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884039 closing signal SIGTERM -W0703 00:25:56.610000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884040 closing signal SIGTERM -W0703 00:25:56.610000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884041 closing signal SIGTERM -W0703 00:25:56.610000 140133463041856 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3884042 closing signal SIGTERM -W0703 00:25:56.613000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390247 closing signal SIGTERM -W0703 00:25:56.613000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390248 closing signal SIGTERM -W0703 00:25:56.613000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390249 closing signal SIGTERM -W0703 00:25:56.615000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390250 closing signal SIGTERM -W0703 00:25:56.615000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390251 closing signal SIGTERM -W0703 00:25:56.614000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755146 closing signal SIGTERM -W0703 00:25:56.615000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755147 closing signal SIGTERM -W0703 00:25:56.615000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755148 closing signal SIGTERM -W0703 00:25:56.615000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390252 closing signal SIGTERM -W0703 00:25:56.616000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390253 closing signal SIGTERM -W0703 00:25:56.616000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755149 closing signal SIGTERM -W0703 00:25:56.617000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755150 closing signal SIGTERM -W0703 00:25:56.618000 140611591423808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1390254 closing signal SIGTERM -W0703 00:25:56.617000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755151 closing signal SIGTERM -W0703 00:25:56.618000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755152 closing signal SIGTERM -W0703 00:25:56.619000 140173338302272 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3755153 closing signal SIGTERM -E0703 00:25:56.725000 140622279550784 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1132283) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:25:56.732000 140622279550784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1132214_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 00:25:56.732000 139732478457664 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1410979) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 00:25:56.731000 139638215915328 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 859606) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:25:56.738000 139732478457664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1410911_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.737000 139638215915328 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_859536_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 00:25:56.740000 139859554359104 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 873038) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 00:25:56.746000 139859554359104 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_872968_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.764000 139638215915328 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_859536_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.768000 140622279550784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1132214_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.773000 139859554359104 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_872968_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.776000 139732478457664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1410911_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:25:56.792000 139638215915328 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_859536_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
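Every failure entry in the tables above and below shows an empty error_file and defers to https://pytorch.org/docs/stable/elastic/errors.html: torchelastic only records a worker's Python traceback when the script's entrypoint is wrapped with its error handler. A minimal sketch of that wrapping (the entrypoint body is an illustrative placeholder, not the run_train.py code):

# Hedged sketch: decorating the entrypoint with torchelastic's @record so a
# worker crash writes its traceback into the error_file referenced by the
# failure tables, instead of leaving it empty.
from torch.distributed.elastic.multiprocessing.errors import record

@record
def main() -> None:
    ...  # training entrypoint body (placeholder)

if __name__ == "__main__":
    main()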
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run -W0703 00:25:56.796000 140622279550784 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1132214_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - elastic_launch( - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 859607) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 859608) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 859609) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 859610) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 859611) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 859612) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 
1 (pid: 859613) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 859606) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1132284) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 1132285) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1132286) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 1132287) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1132288) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1132289) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1132290) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1132283) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0703 00:25:56.801000 139859554359104 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_872968_0' has failed 
to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 873039) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 58 (local_rank: 2) - exitcode : 1 (pid: 873040) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 59 (local_rank: 3) - exitcode : 1 (pid: 873041) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 60 (local_rank: 4) - exitcode : 1 (pid: 873042) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 873043) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 62 (local_rank: 6) - exitcode : 1 (pid: 873044) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 63 (local_rank: 7) - exitcode : 1 (pid: 873045) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:25:56 - host : ip-26-0-171-88.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 873038) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0703 00:25:56.805000 139732478457664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 
'ip-26-0-161-153.ec2.internal_1410911_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 1410980) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 1410981) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 1410982) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 1410983) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 1410984) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 1410985) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 1410986) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_00:25:56 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 1410979) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-78: task 1: Exited with exit code 
1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -W0703 00:26:00.242000 140173338302272 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3755077_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:26:00.253000 140173338302272 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3755077_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise 
RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 00:26:00.336000 140611591423808 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1390178_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:26:00.348000 140611591423808 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-162-233.ec2.internal_1390178_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise 
RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 00:26:00.442000 140133463041856 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3883965_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 00:26:00.454000 140133463041856 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3883965_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( 
-torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -srun: error: ip-26-0-162-233: task 4: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-512/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/bench.slurm deleted file mode 100644 index 7bc175fefab7d955e474c0c8d8b6f81964db3f29..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. 
-# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/config.yaml deleted file mode 100644 index 9578f35975b267ba1a592fba2f36a94b8410d0cb..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 8 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 64 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out deleted file mode 100644 index b6e70b9848cab7df926120ca02a49c4efa6f4134..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/log.out +++ /dev/null @@ -1,2556 +0,0 @@ -======================== -START TIME: Wed Jul 3 10:26:59 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 10:27:04.766000 139873746536256 torch/distributed/run.py:757] -W0703 10:27:04.766000 139873746536256 torch/distributed/run.py:757] ***************************************** -W0703 10:27:04.766000 139873746536256 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:27:04.766000 139873746536256 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.251000 140505158305600 torch/distributed/run.py:757] -W0703 10:27:05.251000 140505158305600 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.251000 140505158305600 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:27:05.251000 140505158305600 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.265000 140418339886912 torch/distributed/run.py:757] -W0703 10:27:05.265000 140418339886912 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.265000 140418339886912 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:27:05.265000 140418339886912 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.437000 140408010262336 torch/distributed/run.py:757] -W0703 10:27:05.437000 140408010262336 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.437000 140408010262336 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:27:05.437000 140408010262336 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.785000 140237037512512 torch/distributed/run.py:757] -W0703 10:27:05.785000 140237037512512 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.785000 140237037512512 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 10:27:05.785000 140237037512512 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.832000 140678480672576 torch/distributed/run.py:757] -W0703 10:27:05.832000 140678480672576 torch/distributed/run.py:757] ***************************************** -W0703 10:27:05.832000 140678480672576 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:27:05.832000 140678480672576 torch/distributed/run.py:757] ***************************************** -W0703 10:27:06.057000 140698347431744 torch/distributed/run.py:757] -W0703 10:27:06.057000 140698347431744 torch/distributed/run.py:757] ***************************************** -W0703 10:27:06.057000 140698347431744 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:27:06.057000 140698347431744 torch/distributed/run.py:757] ***************************************** -W0703 10:27:06.360000 140578875418432 torch/distributed/run.py:757] -W0703 10:27:06.360000 140578875418432 torch/distributed/run.py:757] ***************************************** -W0703 10:27:06.360000 140578875418432 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:27:06.360000 140578875418432 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 10:27:31 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config: -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: run='%date_%jobid', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: step=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: consumed_train_samples=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: benchmark_csv_path=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ignore_sanity_checks=True), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp=2, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp=16, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp_engine=, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_mode=, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_linear_async_communication=False, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: expert_parallel_size=1), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 10:27:31 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: eos_token_id=2, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_act='silu', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_size=2048, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: initializer_range=0.02, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: intermediate_size=4096, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: is_llama_config=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: max_position_embeddings=4096, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_attention_heads=32, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_hidden_layers=24, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_key_value_heads=32, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pad_token_id=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pretraining_tp=1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rms_norm_eps=1e-05, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_scaling=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_theta=10000.0, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tie_word_embeddings=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: use_cache=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: vocab_size=50272), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dtype=torch.bfloat16, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_revision=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_max_length=None), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoint_interval=100000, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: save_initial_state=False, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: resume_checkpoint_path=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: log_level_replica='info', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: iteration_step_info_interval=1), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: train_steps=20, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: micro_batch_size=64, -[default0]:07/03/2024 10:27:31 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: batch_accumulation_per_replica=8, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: val_check_interval=-1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_val_batches=0, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_test_batches=0), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta1=0.9, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta2=0.95, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: torch_adam_is_fused=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: name='adamW'), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: zero_stage=1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: weight_decay=0.01, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: clip_grad=1.0, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_steps=1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_style='linear', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_style='linear', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_steps=19, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_starting_step=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: min_decay_lr=1e-05)), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: start_training_step=1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_splits='train', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_config_name=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_overwrite_cache=False, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: text_column_name='text'), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_loading_workers=0))], -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64')), -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lighteval=None) -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Model Config: -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: 
eos_token_id=2, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_act='silu', -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_size=2048, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: initializer_range=0.02, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: intermediate_size=4096, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: is_llama_config=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: max_position_embeddings=4096, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_attention_heads=32, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_hidden_layers=24, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_key_value_heads=32, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pad_token_id=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pretraining_tp=1, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rms_norm_eps=1e-05, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_scaling=None, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_theta=10000.0, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tie_word_embeddings=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: use_cache=True, -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: vocab_size=50272) -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Building model.. -[default0]:07/03/2024 10:27:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Setting PP block ranks... -[default7]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=7|ip-26-0-169-239]: No checkpoint path provided. -[default1]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=1|ip-26-0-169-239]: No checkpoint path provided. -[default5]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=5|ip-26-0-169-239]: No checkpoint path provided. -[default4]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=4|ip-26-0-169-239]: No checkpoint path provided. -[default6]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=6|ip-26-0-169-239]: No checkpoint path provided. -[default3]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=3|ip-26-0-169-239]: No checkpoint path provided. -[default0]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=0|ip-26-0-169-239]: No checkpoint path provided. -[default2]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=2|ip-26-0-169-239]: No checkpoint path provided. -[default4]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=12|ip-26-0-169-247]: No checkpoint path provided. -[default0]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=8|ip-26-0-169-247]: No checkpoint path provided. -[default7]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=15|ip-26-0-169-247]: No checkpoint path provided. -[default1]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=11|ip-26-0-169-247]: No checkpoint path provided. -[default6]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=14|ip-26-0-171-88]: No checkpoint path provided. 
-[default3]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=10|ip-26-0-169-247]: No checkpoint path provided. -[default5]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=13|ip-26-0-169-247]: No checkpoint path provided. -[default6]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=14|ip-26-0-169-247]: No checkpoint path provided. -[default1]:07/03/2024 10:27:49 [INFO|DP=1|PP=0|TP=9|ip-26-0-169-247]: No checkpoint path provided. -[default5]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 10:27:49 [INFO|DP=1|PP=1|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-56]: No checkpoint path provided. -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-56]: No checkpoint path provided. -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: No checkpoint path provided. -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: No checkpoint path provided. -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-56]: No checkpoint path provided. 
-[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: No checkpoint path provided. -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: No checkpoint path provided. -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: No checkpoint path provided. -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-56]: No checkpoint path provided. -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: No checkpoint path provided. -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: No checkpoint path provided. -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-56]: No checkpoint path provided. -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-56]: No checkpoint path provided. -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-56]: No checkpoint path provided. 
-[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: No checkpoint path provided. -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-56]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-56]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-56]: No checkpoint path provided. -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=6|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=6|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=6|ip-26-0-170-31]: No checkpoint path provided. -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=5|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=5|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=5|ip-26-0-170-31]: No checkpoint path provided. -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: No checkpoint path provided. -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: No checkpoint path provided. -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=7|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=7|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=7|ip-26-0-170-31]: No checkpoint path provided. -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: No checkpoint path provided. -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: No checkpoint path provided. 
-[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=4|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=4|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: No checkpoint path provided. -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=4|ip-26-0-170-31]: No checkpoint path provided. -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: No checkpoint path provided. -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: No checkpoint path provided. -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 10:27:50 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: No checkpoint path provided. -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Total number of parameters: 1.21G (2315.81MiB) -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: No checkpoint path provided. -[default0]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Parametrizing model parameters using StandardParametrizator -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: No checkpoint path provided. -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: No checkpoint path provided. -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: [After model building] Memory usage: 98.13MiB. 
Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 10:27:50 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: No checkpoint path provided. -[default0]:07/03/2024 10:27:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 10:27:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 10:27:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 10:27:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 10:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 10:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Using `datasets` library -[default0]:07/03/2024 10:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 10:27:53 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 10:27:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 10:27:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 10:27:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: -[default0]:07/03/2024 10:27:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Start training] datetime: 2024-07-03 10:27:55.469581 | mbs: 64 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 10:27:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 10:27:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default0]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=8|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=15|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=5|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=1|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=4|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=7|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=14|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=14|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=6|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=2|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=3|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=0|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=12|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=8|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=12|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=10|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=15|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=6|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=3|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=7|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=12|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=15|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=8|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=2|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=4|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=3|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=4|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=0|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=2|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=11|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=5|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=2|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. 
Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=10|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=13|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=9|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:27:55 [WARNING|DP=1|PP=0|TP=14|ip-26-0-169-247]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=1|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=6|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=9|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=10|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=5|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=7|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=1|PP=1|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=13|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:27:55 [WARNING|DP=0|PP=1|TP=1|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:55 [WARNING|DP=0|PP=0|TP=11|ip-26-0-169-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:27:56 [WARNING|DP=0|PP=1|TP=11|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. 
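A quick, illustrative cross-check of the run configuration reported above (a minimal sketch in Python, not part of the original log; mbs, grad_accum, global_batch_size and sequence_length are copied from the "[Start training]" entry, and dp=2 follows from the "[ZeRO sharding] DP Rank 0/1" lines):

# Hedged sanity check of the logged configuration; all numbers are taken from the log above.
mbs, grad_accum, dp, seq_len = 64, 8, 2, 4096
assert mbs * grad_accum * dp == 1024                        # matches "global_batch_size: 1024"
print("tokens per step:", mbs * grad_accum * dp * seq_len)  # 4,194,304 tokens per optimizer step
# The per-rank weight sizes above are likewise consistent with 2 bytes per parameter (bf16):
MIB = 2 ** 20
print(round(43.2e6 * 2 / MIB, 1))   # ~82.4 MiB, vs. logged "43.2M (82.38MiB)" on the PP=0 ranks
print(round(32.7e6 * 2 / MIB, 1))   # ~62.4 MiB, vs. logged "32.7M (62.36MiB)" on the PP=1 ranks

At this point the run fails: the tracebacks below show ranks 8-15 hitting torch.cuda.OutOfMemoryError in the row-parallel o_proj linear, and the error text itself suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True in the environment before launch to reduce fragmentation.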
-[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank15]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 195.94 MiB is free. Including non-PyTorch memory, this process has 79.13 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank12]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank12]: output = model(**micro_batch) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank12]: sharded_logits = self.model( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank12]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank12]: output = self.pp_block(**new_kwargs) -[default4]:[rank12]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank12]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank12]: output = self.o_proj(attention_output) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank12]: return row_linear( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank12]: out = F.linear(input, weight, bias) -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 107.94 MiB is free. Including non-PyTorch memory, this process has 79.21 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: Traceback (most recent call last): -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank8]: trainer.train(dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default0]:[rank8]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank14]: output = model(**micro_batch) -[default0]:[rank8]: output = self.o_proj(attention_output) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank8]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: sharded_logits = self.model( -[default0]:[rank8]: out = F.linear(input, weight, bias) -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank14]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank14]: output = self.o_proj(attention_output) -[default6]:[rank14]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank14]: return row_linear( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: out = F.linear(input, weight, bias) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 107.94 MiB is free. Including non-PyTorch memory, this process has 79.21 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default5]:[rank13]: Traceback (most recent call last): -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank11]: output = model(**micro_batch) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank11]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", 
line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank13]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank13]: output = model(**micro_batch) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default5]:[rank13]: sharded_logits = self.model( -[default5]:[rank13]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank13]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: output = self.o_proj(attention_output) -[default5]:[rank13]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank11]: return 
[Ranks 0-7, 9, 10, 11, and 13 all fail with the same CUDA out-of-memory error inside the row-parallel o_proj linear. The traceback from rank 9 is representative (the interleaved torch.nn Module.__call__ wrapper frames, module.py lines 1532 and 1541, are omitted):]

[default1]:[rank9]: Traceback (most recent call last):
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
[default1]:[rank9]:     trainer.train(dataloader)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
[default1]:[rank9]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
[default1]:[rank9]:     outputs = self.pipeline_engine.train_batch_iter(
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
[default1]:[rank9]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
[default1]:[rank9]:     output = model(**micro_batch)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
[default1]:[rank9]:     sharded_logits = self.model(
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
[default1]:[rank9]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
[default1]:[rank9]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
[default1]:[rank9]:     output = self.pp_block(**new_kwargs)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
[default1]:[rank9]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
[default1]:[rank9]:     output = self.o_proj(attention_output)
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
[default1]:[rank9]:     return row_linear(
[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
[default1]:[rank9]:     out = F.linear(input, weight, bias)
[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 195.94 MiB is free. Including non-PyTorch memory, this process has 79.13 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[The other ranks differ only in the reported free and in-use memory; every rank tried to allocate 1024.00 MiB on a 79.33 GiB device, with 69.56 GiB allocated by PyTorch and 120.88 MiB reserved but unallocated:]

[default0]:[rank0]:  error line truncated in the log after "Tried to allocate 1024.00 MiB. GPU"
[default1]:[rank1]:  707.94 MiB free, 78.63 GiB in use
[default2]:[rank2]:  795.94 MiB free, 78.54 GiB in use
[default3]:[rank3]:  707.94 MiB free, 78.63 GiB in use
[default4]:[rank4]:  795.94 MiB free, 78.54 GiB in use
[default5]:[rank5]:  707.94 MiB free, 78.63 GiB in use
[default6]:[rank6]:  795.94 MiB free, 78.54 GiB in use
[default7]:[rank7]:  707.94 MiB free, 78.63 GiB in use
[default2]:[rank10]: 107.94 MiB free, 79.21 GiB in use
[default3]:[rank11]: 195.94 MiB free, 79.13 GiB in use
[default5]:[rank13]: 195.94 MiB free, 79.13 GiB in use
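The error text itself points at PyTorch's caching-allocator configuration, although here only 120.88 MiB is reserved-but-unallocated, so the processes are genuinely close to the 79.33 GiB capacity rather than badly fragmented. Below is a minimal sketch of how the suggested setting could be applied and how allocator state could be checked, assuming it runs before the first CUDA allocation (the variable is only read when the allocator initializes); report_memory is an illustrative helper, not part of nanotron or bench_cluster.

import os

# Must be set before the first CUDA allocation (e.g. at the very top of the
# training entry point, or exported in the launching shell before torchrun).
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch


def report_memory(device: int = 0) -> None:
    # Allocated vs. reserved memory, comparable to the numbers quoted in the
    # OutOfMemoryError above ("allocated by PyTorch" / "reserved but unallocated").
    allocated_gib = torch.cuda.memory_allocated(device) / 2**30
    reserved_gib = torch.cuda.memory_reserved(device) / 2**30
    print(f"allocated: {allocated_gib:.2f} GiB | reserved: {reserved_gib:.2f} GiB")
    print(torch.cuda.memory_summary(device=device, abbreviated=True))

Exporting the variable in the job script before the srun/torchrun launch has the same effect. If fragmentation is not the issue, the failing allocation is a 1 GiB activation produced by o_proj's F.linear, so the configuration itself is likely over budget and would need a different parallelism split or batch/sequence setting.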
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank5]: output = self.o_proj(attention_output) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank5]: return row_linear( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank5]: out = F.linear(input, weight, bias) -[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 707.94 MiB is free. Including non-PyTorch memory, this process has 78.63 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. 
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
[The identical UserWarning and run_backward line appear for every local rank, default0 through default7, several of them twice.]
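The warning above is autograd reporting that it is backpropagating through the in-place c10d::allreduce_ op without a registered autograd kernel. One place an all-reduce naturally appears in this configuration is the row-parallel linear at the end of the traceback (row_linear / o_proj): each tensor-parallel rank multiplies its input shard by its weight shard and the full-size partial outputs are summed across the group. The sketch below is a generic illustration of that pattern, with the collective wrapped in a custom autograd.Function so the backward is explicit; it is not nanotron's row_linear, and the shapes and single-process demo setup are assumptions.

```python
import torch
import torch.distributed as dist
import torch.nn.functional as F


class _AllReduceSum(torch.autograd.Function):
    """Sum partial outputs across the tensor-parallel group in forward;
    the gradient of a sum w.r.t. each addend is the identity, so backward
    passes the incoming gradient through unchanged."""

    @staticmethod
    def forward(ctx, x):
        if dist.is_initialized() and dist.get_world_size() > 1:
            dist.all_reduce(x, op=dist.ReduceOp.SUM)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output


def row_parallel_linear(x_shard, w_shard, bias=None):
    # x_shard: [tokens, in_features // tp], w_shard: [out_features, in_features // tp].
    # Every rank materializes the FULL [tokens, out_features] partial output here,
    # which is the kind of allocation that fails in the OOM messages in this log.
    partial = F.linear(x_shard, w_shard)
    out = _AllReduceSum.apply(partial)
    if bias is not None:  # bias is added once, after the reduction
        out = out + bias
    return out


if __name__ == "__main__":
    # Single-process demo (world size 1, so the collective is skipped).
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29501",
                            rank=0, world_size=1)
    x = torch.randn(4, 8, requires_grad=True)  # hypothetical shard shapes
    w = torch.randn(16, 8)
    y = row_parallel_linear(x, w)
    y.sum().backward()
    print(y.shape, x.grad.shape)  # torch.Size([4, 16]) torch.Size([4, 8])
    dist.destroy_process_group()
```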
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank17]: sharded_logits = self.model( -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: output = model(**micro_batch) -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) 
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank20]: sharded_logits = self.model( -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: sharded_logits = self.model( -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: output = 
self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default0]:[rank16]: output = model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default1]:[rank17]: output = self.o_proj(attention_output) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward 
-[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default1]:[rank17]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank20]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 107.94 MiB is free. Including non-PyTorch memory, this process has 79.21 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank29]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: output = model(**micro_batch) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: sharded_logits = self.model( -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, 
in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank29]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank29]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: output = self.pp_block(**new_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank29]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank29]: output = self.o_proj(attention_output) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: return row_linear( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank29]: out = F.linear(input, weight, bias) -[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. 
GPU  has a total capacity of 79.33 GiB of which 155.94 MiB is free. Including non-PyTorch memory, this process has 79.17 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = model(**micro_batch) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank30]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank30]: output = self.o_proj(attention_output) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 51.94 MiB is free. Including non-PyTorch memory, this process has 79.27 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 56.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank20]: output = self.o_proj(attention_output) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: output = self.o_proj(attention_output) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: out = F.linear(input, weight, bias) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 195.94 MiB is free. Including non-PyTorch memory, this process has 79.13 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return row_linear( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 107.94 MiB is free. Including non-PyTorch memory, this process has 79.21 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: output = self.o_proj(attention_output) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: return row_linear( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. 
GPU -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: output = self.o_proj(attention_output) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 195.94 MiB is free. Including non-PyTorch memory, this process has 79.13 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: output = model(**micro_batch) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: sharded_logits = self.model( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) 
-[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank21]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank21]: output = self.o_proj(attention_output) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank21]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 107.94 MiB is free. Including non-PyTorch memory, this process has 79.21 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank22]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank22]: output = model(**micro_batch) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank22]: sharded_logits = self.model( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank22]: output = self.pp_block(**new_kwargs) -[default6]:[rank22]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank22]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank22]: output = self.o_proj(attention_output) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank22]: return row_linear( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: out = F.linear(input, weight, bias) -[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 195.94 MiB is free. Including non-PyTorch memory, this process has 79.13 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank23]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank23]: output = self.o_proj(attention_output) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 107.94 MiB is free. Including non-PyTorch memory, this process has 79.21 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank31]: output = model(**micro_batch) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank31]: output = self.o_proj(attention_output) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank31]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 155.94 MiB is free. Including non-PyTorch memory, this process has 79.17 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: output = model(**micro_batch) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank27]: output = model(**micro_batch) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank24]: sharded_logits = self.model( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: sharded_logits = self.model( -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank24]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default3]:[rank27]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank27]: output = self.o_proj(attention_output) -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank24]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: trainer.train(dataloader) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return row_linear( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default2]:[rank26]: outputs = 
self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 155.94 MiB is free. Including non-PyTorch memory, this process has 79.17 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU -[default2]:[rank26]: output = model(**micro_batch) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank26]: sharded_logits = self.model( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank26]: output = self.pp_block(**new_kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank26]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank26]: output = self.o_proj(attention_output) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: return row_linear( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: out = F.linear(input, weight, bias) -[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 51.94 MiB is free. Including non-PyTorch memory, this process has 79.27 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 56.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: Traceback (most recent call last): -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: output = model(**micro_batch) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: sharded_logits = self.model( -[default1]:[rank25]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: sharded_logits = self.model( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward 
-[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: output = self.o_proj(attention_output) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank25]: return row_linear( -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default4]:[rank28]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. 
GPU  has a total capacity of 79.33 GiB of which 155.94 MiB is free. Including non-PyTorch memory, this process has 79.17 GiB memory in use. Of the allocated memory 69.56 GiB is allocated by PyTorch, and 120.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 115.94 MiB is free. Including non-PyTorch memory, this process has 79.21 GiB memory in use. Of the allocated memory 69.43 GiB is allocated by PyTorch, and 122.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. 
DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. 
This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -W0703 10:28:17.610000 140237037512512 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 576247 closing signal SIGTERM -W0703 10:28:17.610000 140237037512512 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 576248 closing signal SIGTERM -W0703 10:28:17.610000 140237037512512 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 576249 closing signal SIGTERM -W0703 10:28:17.610000 140237037512512 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 576251 closing signal SIGTERM -W0703 10:28:17.611000 140237037512512 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 576252 closing signal SIGTERM -W0703 10:28:17.611000 140237037512512 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 576253 closing signal SIGTERM -E0703 10:28:18.938000 140237037512512 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 576246) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:28:17 - host : ip-26-0-169-139.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 576250) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:28:17 - host : ip-26-0-169-139.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 576246) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-139: task 0: Exited with exit code 1 -W0703 10:28:21.721000 140402349528832 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-207.ec2.internal_2593355_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:28:22.045000 139868085802752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-239.ec2.internal_2564739_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.174000 140672819939072 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_45071_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.348000 140692686698240 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-56.ec2.internal_3437950_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.499000 140412679153408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3985502_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.541000 140573214684928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-170-31.ec2.internal_3105317_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.580000 140499497572096 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_973370_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.614000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438025 closing signal SIGTERM -W0703 10:28:22.614000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438026 closing signal SIGTERM -W0703 10:28:22.614000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438027 closing signal SIGTERM -W0703 10:28:22.614000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438028 closing signal SIGTERM -W0703 10:28:22.615000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438029 closing signal SIGTERM -W0703 10:28:22.616000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438030 closing signal SIGTERM -W0703 10:28:22.615000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2564812 closing signal SIGTERM -W0703 10:28:22.616000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2564813 closing signal SIGTERM -W0703 10:28:22.616000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2564814 closing signal SIGTERM -W0703 10:28:22.616000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2564815 closing signal SIGTERM -W0703 10:28:22.616000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2564816 closing signal SIGTERM -W0703 10:28:22.616000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2564817 closing signal SIGTERM -W0703 10:28:22.616000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2564818 closing signal SIGTERM -W0703 10:28:22.616000 139873746536256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 
2564819 closing signal SIGTERM -W0703 10:28:22.617000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438031 closing signal SIGTERM -W0703 10:28:22.617000 140698347431744 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3438032 closing signal SIGTERM -W0703 10:28:22.616000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105392 closing signal SIGTERM -W0703 10:28:22.617000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105393 closing signal SIGTERM -W0703 10:28:22.617000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105394 closing signal SIGTERM -W0703 10:28:22.617000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105395 closing signal SIGTERM -W0703 10:28:22.617000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105396 closing signal SIGTERM -W0703 10:28:22.620000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45144 closing signal SIGTERM -W0703 10:28:22.620000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45145 closing signal SIGTERM -W0703 10:28:22.620000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45146 closing signal SIGTERM -W0703 10:28:22.620000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45147 closing signal SIGTERM -W0703 10:28:22.620000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45148 closing signal SIGTERM -W0703 10:28:22.620000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45149 closing signal SIGTERM -W0703 10:28:22.621000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45150 closing signal SIGTERM -W0703 10:28:22.621000 140678480672576 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 45151 closing signal SIGTERM -W0703 10:28:22.621000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105397 closing signal SIGTERM -W0703 10:28:22.622000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985575 closing signal SIGTERM -W0703 10:28:22.622000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105398 closing signal SIGTERM -W0703 10:28:22.622000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985576 closing signal SIGTERM -W0703 10:28:22.623000 140505158305600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973444 closing signal SIGTERM -W0703 10:28:22.623000 140505158305600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973445 closing signal SIGTERM -W0703 10:28:22.622000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985577 closing signal SIGTERM -W0703 10:28:22.623000 140505158305600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973446 closing signal SIGTERM -W0703 10:28:22.623000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985578 closing signal SIGTERM -W0703 10:28:22.623000 140578875418432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3105399 closing signal SIGTERM -W0703 10:28:22.625000 140505158305600 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973447 closing signal SIGTERM -W0703 10:28:22.625000 140505158305600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973448 closing signal SIGTERM -W0703 10:28:22.625000 140505158305600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973449 closing signal SIGTERM -W0703 10:28:22.625000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985579 closing signal SIGTERM -W0703 10:28:22.625000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985580 closing signal SIGTERM -W0703 10:28:22.625000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985581 closing signal SIGTERM -W0703 10:28:22.627000 140418339886912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3985582 closing signal SIGTERM -W0703 10:28:22.628000 140505158305600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973450 closing signal SIGTERM -W0703 10:28:22.628000 140505158305600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 973451 closing signal SIGTERM -E0703 10:28:22.737000 140408010262336 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2593430) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:28:22.743000 140408010262336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2593355_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.770000 140408010262336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2593355_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:22.798000 140408010262336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2593355_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 2593431) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 2593432) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 2593433) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 2593434) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 2593435) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 2593436) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 2593437) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:28:22 - host : ip-26-0-169-207.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 2593430) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-207: task 1: Exited with exit code 1 -W0703 10:28:27.049000 139868085802752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 
'ip-26-0-169-239.ec2.internal_2564739_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:27.179000 140672819939072 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_45071_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:27.352000 140692686698240 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-56.ec2.internal_3437950_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:27.503000 140412679153408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3985502_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:27.546000 140573214684928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-170-31.ec2.internal_3105317_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:27.585000 140499497572096 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_973370_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:27.654000 140678480672576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_45071_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:27.664000 140678480672576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_45071_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-169-247: task 3: Exited with exit code 1 -W0703 10:28:28.049000 139873746536256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2564739_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:28:28.062000 139873746536256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2564739_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. 
-srun: error: ip-26-0-169-239: task 2: Exited with exit code 1 -W0703 10:28:31.560000 140505158305600 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_973370_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:31.570000 140505158305600 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_973370_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. 
See inner exception for details. -srun: error: ip-26-0-171-88: task 7: Exited with exit code 1 -W0703 10:28:32.356000 140692686698240 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-56.ec2.internal_3437950_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:32.508000 140412679153408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3985502_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:32.551000 140573214684928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-170-31.ec2.internal_3105317_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:35.165000 140418339886912 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3985502_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:35.176000 140418339886912 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3985502_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 10:28:35.359000 140698347431744 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_3437950_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:35.373000 140698347431744 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_3437950_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-171-62: task 6: Exited with exit code 1 -srun: error: ip-26-0-171-56: task 5: Exited with exit code 1 -W0703 10:28:35.763000 140578875418432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-170-31.ec2.internal_3105317_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:28:35.785000 140578875418432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-170-31.ec2.internal_3105317_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-170-31: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-64/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/bench.slurm deleted file mode 100644 index aaf628e6d81cbeef0d3f7038617126049af5512a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8 llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/config.yaml b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/config.yaml deleted file mode 100644 index bda6c3cf5879b328fc0988b367f4de59d3b86659..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 2 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 64 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 8 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out deleted file mode 100644 index 70bcf734944155879fc55ff5343859721cfaae5e..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/log.out +++ /dev/null @@ -1,5801 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:51:09 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:51:12.079000 140394431031104 torch/distributed/run.py:757] -W0703 03:51:12.079000 140394431031104 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.079000 140394431031104 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:51:12.079000 140394431031104 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.111000 139961753220928 torch/distributed/run.py:757] -W0703 03:51:12.111000 139961753220928 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.111000 139961753220928 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:51:12.111000 139961753220928 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.111000 140400945293120 torch/distributed/run.py:757] -W0703 03:51:12.111000 140400945293120 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.111000 140400945293120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:51:12.111000 140400945293120 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.116000 140659577317184 torch/distributed/run.py:757] -W0703 03:51:12.116000 140659577317184 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.116000 140659577317184 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:51:12.116000 140659577317184 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.122000 139907187222336 torch/distributed/run.py:757] -W0703 03:51:12.122000 139907187222336 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.122000 139907187222336 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:51:12.122000 139907187222336 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.124000 140651166279488 torch/distributed/run.py:757] -W0703 03:51:12.124000 140651166279488 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.124000 140651166279488 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:51:12.124000 140651166279488 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.142000 140490932283200 torch/distributed/run.py:757] -W0703 03:51:12.142000 140490932283200 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.142000 140490932283200 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:51:12.142000 140490932283200 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.144000 139800098445120 torch/distributed/run.py:757] -W0703 03:51:12.144000 139800098445120 torch/distributed/run.py:757] ***************************************** -W0703 03:51:12.144000 139800098445120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:51:12.144000 139800098445120 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:51:32 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=2, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:51:32 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=8, -[default0]:07/03/2024 03:51:32 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=64, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8')), -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 03:51:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=3|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=6|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=2|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=7|ip-26-0-161-78]: No checkpoint path provided. 
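Editor's note: the "[Vocab Size Padding]" line above (50257 padded with 15 dummy tokens to 50272, which is also the vocab_size echoed in the Model Config dump) is consistent with rounding the vocabulary up to a multiple of tp=16 so the embedding can be sharded evenly across tensor-parallel ranks; that reading is an inference from the numbers, not something the log states. A quick check:

import math

vocab_size = 50257  # vocab_size from config.yaml
tp = 16             # parallelism.tp from config.yaml

padded = math.ceil(vocab_size / tp) * tp
print(padded)               # 50272, as reported by "[Vocab Size Padding]"
print(padded - vocab_size)  # 15 dummy tokens
print(padded // tp)         # 3142 vocabulary rows per TP shard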
-[default5]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=5|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=4|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=1|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. 
-[default1]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-78]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-78]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=0|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 43.2M (82.38MiB) -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-138]: No checkpoint path provided. 
-[default1]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.21G (2315.81MiB) -[default3]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default5]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=15|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default4]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. 
Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default4]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 43.2M (82.38MiB) -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 98.13MiB. Peak allocated: 100.16MiB Peak reserved: 112.00MiB -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default6]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 03:51:49 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default7]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default6]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: Local number of parameters: 32.7M (62.36MiB) -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: [After model building] Memory usage: 73.37MiB. Peak allocated: 75.40MiB Peak reserved: 82.00MiB -[default1]:07/03/2024 03:51:49 [INFO|DP=0|PP=1|TP=9|ip-26-0-171-102]: No checkpoint path provided. 
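Editor's note: the per-rank figures above are self-consistent. Each of the 16 TP shards of pipeline stage 0 reports 43.2M parameters and each stage-1 shard 32.7M, which sums back to the reported total of 1.21G (DP replicas hold copies and do not add to the total), and with zero_stage: 1 over dp=2 each replica keeps optimizer states for half of its local shard, the 21.6M-of-43.2M (50.00%) split reported just below. A small sanity-check sketch of that arithmetic:

# Figures taken from the "[After model building]" and "[ZeRO sharding]" log lines.
tp, pp, dp = 16, 2, 2                # 64 GPUs = dp * tp * pp
stage0_params_per_tp_rank = 43.2e6   # PP=0 shards
stage1_params_per_tp_rank = 32.7e6   # PP=1 shards

total_params = tp * (stage0_params_per_tp_rank + stage1_params_per_tp_rank)
print(f"{total_params / 1e9:.2f}G")  # 1.21G, matching "Total number of parameters"

# ZeRO stage 1 splits optimizer states across the dp=2 replicas.
opt_states_per_dp_rank = stage0_params_per_tp_rank / dp
print(f"{opt_states_per_dp_rank / 1e6:.1f}M")  # 21.6M out of 43.2M (50.00%)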
-[default5]:07/03/2024 03:51:49 [INFO|DP=1|PP=1|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:51:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 03:51:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 03:51:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 03:51:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 21.6M out of 43.2M (50.00%) params' optimizer states -[default0]:07/03/2024 03:51:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 03:51:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 03:51:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 03:51:52 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 03:51:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 03:51:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 03:51:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 03:51:53.142838 | mbs: 8 | grad_accum: 64 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 03:51:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 03:51:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 345.27MiB. Peak allocated 345.27MiB. Peak reserved: 362.00MiB -[default3]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=3|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=6|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=2|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=7|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=11|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=13|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=5|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=1|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. 
Setting CardData to empty. -[default7]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=0|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=6|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=5|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. 
Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=2|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=8|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=4|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=12|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=15|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=4|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=9|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=14|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=1|PP=1|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=0|PP=1|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=10|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:51:53 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:51:53 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
[... the same UserWarning from torch/autograd/graph.py:744 ("c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior."), together with the run_backward frame, is emitted during the backward pass by every local rank, [default0] through [default7], on every node; the remaining duplicate copies are omitted here.]
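For context on the warning above: it fires when autograd backpropagates through a collective (here c10d::allreduce_) that has no autograd kernel registered, so the engine falls back to the deprecated not-implemented behavior. A common way to avoid relying on that fallback is to wrap the collective in an explicit torch.autograd.Function with a hand-written backward rule. The sketch below is a generic illustration of that pattern, not nanotron's code; the pass-through backward is an assumption chosen for simplicity.

import torch
import torch.distributed as dist

class AllReduceSum(torch.autograd.Function):
    """Illustrative differentiable all-reduce: sum across ranks in forward,
    pass the incoming gradient through unchanged in backward."""

    @staticmethod
    def forward(ctx, tensor):
        out = tensor.clone()                        # avoid in-place mutation of the autograd input
        dist.all_reduce(out, op=dist.ReduceOp.SUM)  # collective with no autograd kernel of its own
        return out

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output                          # identity backward (a simplifying assumption)

Usage would be y = AllReduceSum.apply(x) inside a module's forward, assuming a process group has already been initialized.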
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600008 milliseconds before timing out.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
[... all 64 ranks log the same NCCL watchdog timeout for the SEND collective at SeqNum=15: ranks 0-31 report it as "[Rank 0]" with NumelIn=NumelOut=32768, and ranks 32-63 as "[Rank 1]" with NumelIn=NumelOut=4194304, each running for between roughly 600001 and 600099 milliseconds before timing out; the remaining per-rank copies, including the ones interleaved with the tracebacks below, are condensed into this summary.]
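The Timeout(ms)=600000 in these watchdog messages is the ten-minute collective timeout this process group was created with; once a collective has been pending longer than that, the watchdog aborts the NCCL communicator. For reference, this timeout is normally chosen when the process group is initialized, as in the sketch below; the thirty-minute value is purely illustrative and is not what this benchmark used.

import datetime
import torch.distributed as dist

# Illustrative only: pick a timeout appropriate for the job. The NCCL watchdog
# aborts the communicator once a collective has been pending longer than this.
dist.init_process_group(
    backend="nccl",
    timeout=datetime.timedelta(minutes=30),
)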
-[default3]:[rank27]: Traceback (most recent call last):
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank27]:     trainer.train(dataloader)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank27]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank27]:     outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default3]:[rank27]:     self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default3]:[rank27]:     grad_accumulator.backward(sum(activations))
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default3]:[rank27]:     result = loss.backward()
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default3]:[rank27]:     torch.autograd.backward(
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default3]:[rank27]:     _engine_run_backward(
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default3]:[rank27]:     return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default3]:[rank27]:     return user_fn(self, *args)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default3]:[rank27]:     pipeline_state.run_communication()
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default3]:[rank27]:     self.grads_buffer.append(recv_grad())
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default3]:[rank27]:     return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default3]:[rank27]:     buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default3]:[rank27]:     meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default3]:[rank27]:     dist.recv(
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default3]:[rank27]:     return func(*args, **kwargs)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default3]:[rank27]:     pg.recv([tensor], group_src_rank, tag).wait()
-[default3]:[rank27]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
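The frames above (recv_tensors, then irecv_tensors, then _recv_meta, then dist.recv) show the rank blocked on a receive of shape metadata from its pipeline peer; once the watchdog aborts the timed-out communicator, that pending receive surfaces as the DistBackendError at the end of the traceback. The sketch below illustrates a generic metadata-then-payload exchange of this kind; the function names, the eight-element metadata layout, and the bfloat16 payload dtype are assumptions for illustration only, not nanotron's actual p2p implementation.

import torch
import torch.distributed as dist

def send_tensor(t: torch.Tensor, to_rank: int) -> None:
    # 1) Send a small metadata tensor first: number of dims, then the shape.
    meta = torch.zeros(8, dtype=torch.long, device=t.device)
    meta[0] = t.dim()
    meta[1:1 + t.dim()] = torch.tensor(t.shape, dtype=torch.long, device=t.device)
    dist.send(meta, dst=to_rank)
    # 2) Send the payload itself.
    dist.send(t, dst=to_rank)

def recv_tensor(from_rank: int, device: str = "cuda") -> torch.Tensor:
    # 1) Blocking receive of the metadata; if the peer never posts the matching
    #    send, this call hangs until the NCCL watchdog aborts the communicator,
    #    as in the log above.
    meta = torch.empty(8, dtype=torch.long, device=device)
    dist.recv(meta, src=from_rank)
    ndim = int(meta[0])
    shape = [int(d) for d in meta[1:1 + ndim]]
    # 2) Receive the payload with the shape learned from the metadata.
    buf = torch.empty(shape, dtype=torch.bfloat16, device=device)
    dist.recv(buf, src=from_rank)
    return buf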
-[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: trainer.train(dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default2]:[rank26]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:[rank26]: grad_accumulator.backward(sum(activations)) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default2]:[rank26]: result = loss.backward() -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]:[rank26]: torch.autograd.backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default2]:[rank26]: _engine_run_backward( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default2]:[rank26]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default2]:[rank26]: return user_fn(self, *args) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default2]:[rank26]: pipeline_state.run_communication() -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default2]:[rank26]: self.grads_buffer.append(recv_grad()) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default2]:[rank26]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default2]:[rank26]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default2]:[rank26]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default2]:[rank26]: dist.recv( -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default2]:[rank26]: return func(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default2]:[rank26]: pg.recv([tensor], group_src_rank, tag).wait() -[default2]:[rank26]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank22]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default6]:[rank22]: grad_accumulator.backward(sum(activations)) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank22]: result = loss.backward() -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default6]:[rank22]: torch.autograd.backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank22]: _engine_run_backward( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank22]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank22]: return user_fn(self, *args) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank22]: pipeline_state.run_communication() -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank22]: self.grads_buffer.append(recv_grad()) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank22]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank22]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank22]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", 
line 269, in _recv_meta -[default6]:[rank22]: dist.recv( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank22]: return func(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank22]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank22]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank21]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank21]: grad_accumulator.backward(sum(activations)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank21]: result = loss.backward() -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default5]:[rank21]: torch.autograd.backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank21]: _engine_run_backward( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank21]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank21]: return user_fn(self, *args) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank21]: pipeline_state.run_communication() -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank21]: self.grads_buffer.append(recv_grad()) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank21]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank21]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank21]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank21]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank21]: dist.recv( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank21]: return func(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank21]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank21]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank16]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank16]: grad_accumulator.backward(sum(activations)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank16]: result = loss.backward() -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default0]:[rank16]: torch.autograd.backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank16]: _engine_run_backward( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, 
in _engine_run_backward -[default0]:[rank16]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank16]: return user_fn(self, *args) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank16]: pipeline_state.run_communication() -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank16]: self.grads_buffer.append(recv_grad()) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank16]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank16]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank16]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank16]: dist.recv( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank16]: return func(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank16]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank16]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default5]:[rank29]: Traceback (most recent call last):
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default5]:[rank29]: trainer.train(dataloader)
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default5]:[rank29]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default5]:[rank29]: grad_accumulator.backward(sum(activations))
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default5]:[rank29]: result = loss.backward()
-[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default5]:[rank29]: torch.autograd.backward(
-[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default5]:[rank29]: _engine_run_backward(
-[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default5]:[rank29]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default5]:[rank29]: return user_fn(self, *args)
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default5]:[rank29]: pipeline_state.run_communication()
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default5]:[rank29]: self.grads_buffer.append(recv_grad())
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default5]:[rank29]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank29]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank29]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank29]: dist.recv(
-[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank29]: return func(*args, **kwargs)
-[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank29]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank29]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
-[…rank 17 and rank 18 abort with the same backward-path traceback as rank 29 above: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0…]
-[…ranks 14, 13 and 15 abort with the same backward-path traceback as rank 29 above (the rank 13 and rank 15 frames are interleaved in the original log): torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0…]
-[…rank 31 aborts with the same backward-path traceback as rank 29 above: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0…]
-[default2]:[rank58]: Traceback (most recent call last):
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank58]: trainer.train(dataloader)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank58]: output = model(**micro_batch)
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank58]: return self._call_impl(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank58]: return forward_call(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank58]: sharded_logits = self.model(
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank58]: return self._call_impl(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank58]: return forward_call(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank58]: return self._call_impl(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank58]: return forward_call(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default2]:[rank58]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default2]:[rank58]: pipeline_state.run_communication()
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default2]:[rank58]: recv_activation_tensor = recv_activation()
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default2]:[rank58]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank58]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank58]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default2]:[rank58]: dist.recv(
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank58]: return func(*args, **kwargs)
-[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank58]: pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank58]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[…rank 0 and rank 1 abort with the same backward-path traceback as rank 29 above: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0…]
-[…rank 50 aborts with the same forward-path traceback as rank 58 above: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1…]
-[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank55]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default7]:[rank55]: pipeline_state.run_communication() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank55]: recv_activation_tensor = recv_activation() -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank55]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank55]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank55]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank55]: dist.recv( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank55]: return func(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank55]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank55]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank43]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default3]:[rank43]: pipeline_state.run_communication() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank43]: recv_activation_tensor = recv_activation() -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank43]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank43]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank43]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank43]: dist.recv( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank43]: return func(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank43]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank43]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default5]:[rank61]: Traceback (most recent call last):
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default5]:[rank61]: trainer.train(dataloader)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter(
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default5]:[rank61]: output = model(**micro_batch)
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank61]: return self._call_impl(*args, **kwargs)
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank61]: return forward_call(*args, **kwargs)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default5]:[rank61]: sharded_logits = self.model(
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank61]: return self._call_impl(*args, **kwargs)
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank61]: return forward_call(*args, **kwargs)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default5]:[rank61]: return self._call_impl(*args, **kwargs)
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default5]:[rank61]: return forward_call(*args, **kwargs)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default5]:[rank61]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default5]:[rank61]: pipeline_state.run_communication()
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default5]:[rank61]: recv_activation_tensor = recv_activation()
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default5]:[rank61]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default5]:[rank61]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default5]:[rank61]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default5]:[rank61]: dist.recv(
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default5]:[rank61]: return func(*args, **kwargs)
-[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default5]:[rank61]: pg.recv([tensor], group_src_rank, tag).wait()
-[default5]:[rank61]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
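The activation-recv traceback above bottoms out in nanotron's p2p._recv_meta, i.e. a blocking dist.recv on a small metadata tensor posted before the payload. As a rough illustration only (not nanotron's code; the helper names send_with_meta / recv_with_meta are made up), the pattern looks roughly like the sketch below: if the peer stage never posts the matching send, the receiver blocks inside dist.recv until the communicator is torn down, which is the state the ranks above are stuck in.

# Minimal sketch of a metadata-then-payload P2P exchange with torch.distributed.
# Launch with e.g.: torchrun --nproc_per_node 2 p2p_sketch.py
import torch
import torch.distributed as dist

def send_with_meta(t: torch.Tensor, dst: int) -> None:
    # Hypothetical helper: ship (ndim, numel) ahead of the payload.
    meta = torch.tensor([t.dim(), t.numel()], dtype=torch.long)
    dist.send(meta, dst=dst)
    dist.send(t.contiguous().view(-1), dst=dst)

def recv_with_meta(src: int) -> torch.Tensor:
    # Hypothetical helper: block on the metadata first, then on the payload,
    # mirroring the _recv_meta -> irecv_tensors ordering in the traceback.
    meta = torch.empty(2, dtype=torch.long)
    dist.recv(meta, src=src)            # <- the call the stuck ranks are blocked in
    payload = torch.empty(int(meta[1].item()))
    dist.recv(payload, src=src)
    return payload                      # returned flattened, for brevity

if __name__ == "__main__":
    # gloo keeps the sketch runnable on CPU; the benchmark itself uses NCCL.
    dist.init_process_group(backend="gloo")
    if dist.get_rank() == 0:
        send_with_meta(torch.ones(4, 8), dst=1)
    elif dist.get_rank() == 1:
        print(recv_with_meta(src=0).shape)
    dist.destroy_process_group()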
-[default2]:[rank2]: Traceback (most recent call last):
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank2]: trainer.train(dataloader)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default2]:[rank2]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default2]:[rank2]: grad_accumulator.backward(sum(activations))
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default2]:[rank2]: result = loss.backward()
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default2]:[rank2]: torch.autograd.backward(
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default2]:[rank2]: _engine_run_backward(
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default2]:[rank2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default2]:[rank2]: return user_fn(self, *args)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward
-[default2]:[rank2]: pipeline_state.run_communication()
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication
-[default2]:[rank2]: self.grads_buffer.append(recv_grad())
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__
-[default2]:[rank2]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default2]:[rank2]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default2]:[rank2]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default2]:[rank2]: dist.recv(
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default2]:[rank2]: return func(*args, **kwargs)
-[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default2]:[rank2]: pg.recv([tensor], group_src_rank, tag).wait()
-[default2]:[rank2]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0.
[Ranks 6 and 20 report the same gradient-recv traceback (train_batch_iter -> backward -> loss.backward -> functional.py backward -> recv_grad -> p2p._recv_meta -> dist.recv), each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."]
[Rank 10 reports the same gradient-recv traceback as rank 2 above, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."]
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
[The C++ frame dump (c10::Error::Error -> ProcessGroupNCCL::WorkNCCL::checkTimeout -> watchdogHandler -> ncclCommWatchdog -> libstdc++ / libpthread / clone) follows here, is repeated inside the what() of the c10::DistBackendError that terminates the process, and appears once more from ncclCommWatchdog at ProcessGroupNCCL.cpp:1418.]
-[default6]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
[Ranks 34 and 35 report the same activation-recv traceback as rank 61 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
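The watchdog lines report Timeout(ms)=600000, i.e. the default 10-minute NCCL collective timeout, on a SEND that never completed. If a longer grace period is wanted while debugging, the timeout can be passed when the process group is created; a sketch is below (assuming a torchrun-style launch that provides the env:// rendezvous variables). Raising it only delays the abort and does not fix the underlying pipeline desynchronization.

from datetime import timedelta

import torch.distributed as dist

# Sketch: raise the collective/watchdog timeout from the 10-minute default.
dist.init_process_group(
    backend="nccl",
    timeout=timedelta(minutes=30),  # hung collectives are aborted after this
)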
[Ranks 42 and 59 report the same activation-recv traceback as rank 61 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1."]
[Ranks 8 and 23 report the same gradient-recv traceback as rank 2 above, each ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0."]
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600001 milliseconds before timing out.
[Rank 16's watchdog then prints the same checkTimeout frame dump and c10::DistBackendError termination as rank 22 above, interleaved line by line with rank 19's gradient-recv traceback, which matches rank 2's and is shown here down to its "dist.recv(" frame.]
[Rank 7 enters the same gradient-recv path; its traceback matches rank 2's down to torch.distributed's recv and continues below:]
-[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default7]:[rank7]: pg.recv([tensor], group_src_rank,
tag).wait() -[default7]:[rank7]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default3]:[rank3]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default3]:[rank3]: grad_accumulator.backward(sum(activations)) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default3]:[rank3]: result = loss.backward() -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default3]:[rank3]: torch.autograd.backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default3]:[rank3]: _engine_run_backward( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default3]:[rank3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default3]:[rank3]: return user_fn(self, *args) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default3]:[rank3]: pipeline_state.run_communication() -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default3]:[rank3]: self.grads_buffer.append(recv_grad()) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default3]:[rank3]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank3]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank3]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank3]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank3]: dist.recv( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank3]: return func(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank3]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank3]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank49]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank49]: pipeline_state.run_communication() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank49]: recv_activation_tensor = recv_activation() -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank49]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank49]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank49]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank49]: dist.recv( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank49]: return func(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank49]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank49]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63749ac897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default3]:[rank19]: return func(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:frame #1: + 0xe32119 (0x7f637590f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f63c1724e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f63c676b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f63c6536353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank19]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank19]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72faae9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f72fbdc2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f72fbdc7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f72fbdc8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f7347861e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f734c8a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f734c673353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72faae9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f72fbdc2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f72fbdc7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f72fbdc8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f7347861e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f734c8a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f734c673353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f72faae9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f72fba4c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f7347861e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f734c8a8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f734c673353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bc5802897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4bc6adbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4bc6ae0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4bc6ae1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f4c1257ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f4c175c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f4c1738c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600017 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bc5802897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4bc6adbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4bc6ae0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4bc6ae1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f4c1257ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f4c175c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f4c1738c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4bc5802897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f4bc6765119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f4c1257ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f4c175c1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f4c1738c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default0]:[rank24]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default0]:[rank24]: grad_accumulator.backward(sum(activations)) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default0]:[rank24]: result = loss.backward() -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default0]:[rank24]: torch.autograd.backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default0]:[rank24]: _engine_run_backward( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default0]:[rank24]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default0]:[rank24]: return user_fn(self, *args) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default0]:[rank24]: pipeline_state.run_communication() -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default0]:[rank24]: self.grads_buffer.append(recv_grad()) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default0]:[rank24]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank24]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank24]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank24]: dist.recv( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank24]: return func(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank24]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank24]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank38]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank38]: pipeline_state.run_communication() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank38]: recv_activation_tensor = recv_activation() -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank38]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank38]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank38]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank38]: dist.recv( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank38]: return func(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank38]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank38]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank62]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank62]: pipeline_state.run_communication() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank62]: recv_activation_tensor = recv_activation() -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank62]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank62]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank62]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank62]: dist.recv( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank62]: return func(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank62]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank62]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f054ec77897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f054ff50c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f054ff55a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f054ff56dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f059b9efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f05a0a36609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f05a0801353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f054ec77897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default6]:[rank30]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f054ff50c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f054ff55a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f054ff56dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f059b9efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f05a0a36609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f05a0801353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f054ec77897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default2]:frame #1: + 0xe32119 (0x7f054fbda119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:[rank30]: grad_accumulator.backward(sum(activations)) -[default2]:frame #2: + 0xd3e95 (0x7f059b9efe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f05a0a36609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f05a0801353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default6]:[rank30]: result = loss.backward() -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default2]: -[default6]:[rank30]: torch.autograd.backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default6]:[rank30]: _engine_run_backward( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default6]:[rank30]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:[rank30]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default6]:[rank30]: return user_fn(self, *args) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default6]:[rank30]: pipeline_state.run_communication() -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default6]:[rank30]: self.grads_buffer.append(recv_grad()) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default6]:[rank30]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank30]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank30]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank30]: dist.recv( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank30]: return func(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank30]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank30]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank40]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default0]:[rank40]: pipeline_state.run_communication() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank40]: recv_activation_tensor = recv_activation() -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank40]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank40]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank40]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank40]: dist.recv( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank40]: return func(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank40]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank40]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87214b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f872278dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8722792a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8722793dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f876e22ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f8773273609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f877303e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87214b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f872278dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8722792a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8722793dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f876e22ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f8773273609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f877303e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f87214b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f8722417119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f876e22ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f8773273609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f877303e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default1]:[rank25]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default1]:[rank25]: grad_accumulator.backward(sum(activations)) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default1]:[rank25]: result = loss.backward() -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default1]:[rank25]: torch.autograd.backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default1]:[rank25]: _engine_run_backward( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default1]:[rank25]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default1]:[rank25]: return user_fn(self, *args) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default1]:[rank25]: pipeline_state.run_communication() -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default1]:[rank25]: self.grads_buffer.append(recv_grad()) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default1]:[rank25]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank25]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors 
-[default1]:[rank25]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank25]: dist.recv( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default1]:[rank25]: return func(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank25]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank25]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank47]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank47]: pipeline_state.run_communication() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank47]: recv_activation_tensor = recv_activation() -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank47]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank47]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank47]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank47]: dist.recv( -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank47]: return func(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank47]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank47]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f365559b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3656874c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3656879a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f365687adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f36a2313e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f36a735a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f36a7125353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600059 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f365559b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3656874c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3656879a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f365687adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f36a2313e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f36a735a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f36a7125353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f365559b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f36564fe119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f36a2313e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7f36a735a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f36a7125353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7296cf1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7297fcac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7297fcfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7297fd0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f72e3a69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f72e8ab0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f72e887b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7296cf1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7297fcac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7297fcfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7297fd0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f72e3a69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f72e8ab0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f72e887b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7296cf1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f7297c54119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f72e3a69e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f72e8ab0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f72e887b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank28]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank28]: grad_accumulator.backward(sum(activations)) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank28]: result = loss.backward() -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default4]:[rank28]: torch.autograd.backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank28]: _engine_run_backward( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank28]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank28]: return user_fn(self, *args) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank28]: pipeline_state.run_communication() -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank28]: self.grads_buffer.append(recv_grad()) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank28]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank28]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank28]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank28]: dist.recv( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank28]: return func(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank28]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank28]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4813186897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f481445fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4814464a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4814465dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f485fefee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4864f45609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4864d10353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4813186897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f481445fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4814464a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4814465dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f485fefee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4864f45609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4864d10353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4813186897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f48140e9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f485fefee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f4864f45609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4864d10353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff51d071897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff51e34ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff51e34fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff51e350dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff569de9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff56ee30609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff56ebfb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff51d071897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff51e34ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff51e34fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff51e350dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff569de9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff56ee30609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff56ebfb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff51d071897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7ff51dfd4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7ff569de9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7ff56ee30609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7ff56ebfb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: trainer.train(dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default3]:[rank51]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default3]:[rank51]: pipeline_state.run_communication() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default3]:[rank51]: recv_activation_tensor = recv_activation() -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default3]:[rank51]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default3]:[rank51]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default3]:[rank51]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default3]:[rank51]: dist.recv( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default3]:[rank51]: return func(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default3]:[rank51]: pg.recv([tensor], group_src_rank, tag).wait() -[default3]:[rank51]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, 
**kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default7]:[rank39]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default7]:[rank39]: pipeline_state.run_communication() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default7]:[rank39]: recv_activation_tensor = recv_activation() -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default7]:[rank39]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default7]:[rank39]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default7]:[rank39]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default7]:[rank39]: dist.recv( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default7]:[rank39]: return func(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default7]:[rank39]: pg.recv([tensor], group_src_rank, tag).wait() -[default7]:[rank39]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. 
-[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default6]:[rank46]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in 
recv_from_pipeline_state_buffer -[default6]:[rank46]: pipeline_state.run_communication() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default6]:[rank46]: recv_activation_tensor = recv_activation() -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default6]:[rank46]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default6]:[rank46]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default6]:[rank46]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default6]:[rank46]: dist.recv( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default6]:[rank46]: return func(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default6]:[rank46]: pg.recv([tensor], group_src_rank, tag).wait() -[default6]:[rank46]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1bdc4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1bef27c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1bef2ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1bef2ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb20a9c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb20fa0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb20f7d8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1bdc4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1bef27c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1bef2ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1bef2ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fb20a9c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fb20fa0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fb20f7d8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1bdc4e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fb1bebb1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fb20a9c6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fb20fa0d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fb20f7d8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8300b84897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8301e5dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8301e62a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8301e63dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f834d8fce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f8352943609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f835270e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600090 milliseconds before timing out. 
[The rest of this stretch of the deleted log.out repeats the same few messages across the other ranks; condensed:]

- Rank 55's abort message continues with a second copy of the checkTimeout stack trace and a further trace, "Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first)", consisting of c10::Error::Error followed by unresolved frames in libtorch_cuda.so, libstdc++.so.6, libpthread.so.0 and libc.so.6.

- The same watchdog-timeout block ([E ProcessGroupNCCL.cpp:1537/577/583/1414], the checkTimeout and ncclCommWatchdog traces, and "terminate called after throwing an instance of 'c10::DistBackendError'") is printed for [PG 4 Rank 1] by rank 43 (ran for 600027 ms) and rank 59 (600069 ms), each with WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000), and for [PG 4 Rank 0] by rank 20 (600007 ms), rank 13 (600070 ms), rank 10 (600069 ms) and rank 8 (600070 ms), each with NumelIn=NumelOut=32768.

- Ranks 33, 48, 52, 53 and 63 print the same Python traceback from the forward path, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1": run_train.py:237 <module> -> trainer.py:429 train -> trainer.py:462 training_step -> parallel/pipeline_parallel/engine.py:278 train_batch_iter -> engine.py:44 forward (output = model(**micro_batch)) -> models/llama.py:891 forward -> llama.py:764 forward -> llama.py:780 forward_with_hidden_states -> parallel/pipeline_parallel/block.py:126 forward (new_kwargs[name] = recv_from_pipeline_state_buffer(...)) -> functional.py:117 recv_from_pipeline_state_buffer -> state.py:150 run_communication -> state.py:31 __call__ (recv_activation) -> p2p.py:353 recv_tensors -> p2p.py:326 irecv_tensors -> p2p.py:269 _recv_meta -> dist.recv -> torch/distributed/distributed_c10d.py:1932 recv -> pg.recv([tensor], group_src_rank, tag).wait(), with the usual torch/nn/modules/module.py _wrapped_call_impl/_call_impl hops at each module call.

- Ranks 9 and 11 fail on the backward path instead, ending in "torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0": engine.py:295 train_batch_iter -> engine.py:86 backward (grad_accumulator.backward(sum(activations))) -> optim/gradient_accumulator.py:205 backward (loss.backward()) -> torch/_tensor.py:525 -> torch/autograd -> functional.py:40 backward -> state.py:172 run_communication (self.grads_buffer.append(recv_grad())) -> state.py:50 __call__ -> the same p2p.py recv_tensors / irecv_tensors / _recv_meta -> dist.recv chain.

- Rank 56 then starts the same forward-path traceback, which continues below.
return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank56]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank56]: pipeline_state.run_communication() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank56]: recv_activation_tensor = recv_activation() -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank56]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank56]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank56]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank56]: dist.recv( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default0]:[rank56]: return func(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank56]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank56]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f98be1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1f99ebac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1f99ebfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1f99ec0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1fe5959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1fea9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1fea76b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f98be1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1f99ebac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1f99ebfa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1f99ec0dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1fe5959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1fea9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1fea76b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f98be1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f1f99b44119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f1fe5959e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7f1fea9a0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f1fea76b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f49cfda6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f49d107fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f49d1084a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f49d1085dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4a1cb1ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4a21b65609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4a21930353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600091 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f49cfda6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f49d107fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f49d1084a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f49d1085dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4a1cb1ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4a21b65609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4a21930353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f49cfda6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f49d0d09119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f4a1cb1ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f4a21b65609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4a21930353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffab757d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffab8856c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffab885ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffab885cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ffb042f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ffb0933c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ffb09107353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffab757d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffab8856c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffab885ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffab885cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ffb042f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ffb0933c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ffb09107353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffab757d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7ffab84e0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7ffb042f5e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7ffb0933c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7ffb09107353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank60]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank60]: pipeline_state.run_communication() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank60]: recv_activation_tensor = recv_activation() -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank60]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank60]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank60]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank60]: dist.recv( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank60]: return func(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank60]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank60]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63d8479897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f63d9752c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f63d9757a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f63d9758dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f64251f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f642a238609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f642a003353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63d8479897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f63d9752c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f63d9757a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f63d9758dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f64251f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f642a238609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f642a003353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f63d8479897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f63d93dc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f64251f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f642a238609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f642a003353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe3eab47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe3ebe20c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe3ebe25a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe3ebe26dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe4378bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fe43c906609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fe43c6d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600076 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe3eab47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe3ebe20c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe3ebe25a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe3ebe26dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fe4378bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fe43c906609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fe43c6d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe3eab47897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fe3ebaaa119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fe4378bfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fe43c906609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fe43c6d1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank12]: Traceback (most recent call last): -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank12]: trainer.train(dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank12]: grad_accumulator.backward(sum(activations)) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank12]: result = loss.backward() -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", 
line 525, in backward -[default4]:[rank12]: torch.autograd.backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank12]: _engine_run_backward( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank12]: return user_fn(self, *args) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank12]: pipeline_state.run_communication() -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank12]: self.grads_buffer.append(recv_grad()) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank12]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank12]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank12]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank12]: dist.recv( -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank12]: return func(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank12]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank12]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f710b238897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f710c511c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f710c516a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f710c517dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7157fb0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f715cff7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f715cdc2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f710b238897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f710c511c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f710c516a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f710c517dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f7157fb0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f715cff7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f715cdc2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f710b238897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f710c19b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f7157fb0e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f715cff7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f715cdc2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3766b0e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3767de7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3767deca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3767deddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f37b3886e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f37b88cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f37b8698353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3766b0e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3767de7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3767deca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3767deddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f37b3886e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f37b88cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f37b8698353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3766b0e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f3767a71119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f37b3886e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f37b88cd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f37b8698353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f407ab8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f407be67c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f407be6ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f407be6ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f40c7906e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f40cc94d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f40cc718353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f407ab8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f407be67c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f407be6ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f407be6ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f40c7906e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f40cc94d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f40cc718353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f407ab8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f407baf1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f40c7906e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f40cc94d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f40cc718353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fe3fc7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4fe52a0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4fe52a5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4fe52a6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5030d3fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5035d86609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5035b51353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fe3fc7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4fe52a0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4fe52a5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4fe52a6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f5030d3fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f5035d86609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f5035b51353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4fe3fc7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4fe4f2a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f5030d3fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f5035d86609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f5035b51353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8d2308d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8d24366c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8d2436ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8d2436cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f8d6fe05e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f8d74e4c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8d74c17353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8d2308d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8d24366c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8d2436ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8d2436cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f8d6fe05e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f8d74e4c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f8d74c17353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8d2308d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f8d23ff0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f8d6fe05e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7f8d74e4c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f8d74c17353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f969b5a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f969c881c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f969c886a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f969c887dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f96e8320e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f96ed367609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f96ed132353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f969b5a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f969c881c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f969c886a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f969c887dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f96e8320e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f96ed367609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f96ed132353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f969b5a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f969c50b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f96e8320e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f96ed367609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f96ed132353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default5]:[rank5]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default5]:[rank5]: grad_accumulator.backward(sum(activations)) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default5]:[rank5]: result = loss.backward() -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in 
backward -[default5]:[rank5]: torch.autograd.backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default5]:[rank5]: _engine_run_backward( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default5]:[rank5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default5]:[rank5]: return user_fn(self, *args) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default5]:[rank5]: pipeline_state.run_communication() -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default5]:[rank5]: self.grads_buffer.append(recv_grad()) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default5]:[rank5]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank5]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank5]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank5]: dist.recv( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank5]: return func(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank5]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank5]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. 
-[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter -[default4]:[rank4]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward -[default4]:[rank4]: grad_accumulator.backward(sum(activations)) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward -[default4]:[rank4]: result = loss.backward() -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward -[default4]:[rank4]: torch.autograd.backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward -[default4]:[rank4]: _engine_run_backward( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward -[default4]:[rank4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply -[default4]:[rank4]: return user_fn(self, *args) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 40, in backward -[default4]:[rank4]: pipeline_state.run_communication() -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 172, in run_communication -[default4]:[rank4]: self.grads_buffer.append(recv_grad()) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 50, in __call__ -[default4]:[rank4]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank4]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank4]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta 
-[default4]:[rank4]: dist.recv( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank4]: return func(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank4]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank4]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 0. -[default5]:[rank45]: Traceback (most recent call last): -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: trainer.train(dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank45]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank45]: pipeline_state.run_communication() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank45]: recv_activation_tensor = recv_activation() -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank45]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank45]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank45]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank45]: dist.recv( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank45]: return func(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank45]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank45]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9815b1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa98288ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa98288fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa982890dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa9ce329e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa9d3370609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa9d313b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9815b1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa98288ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa98288fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa982890dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa9ce329e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa9d3370609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa9d313b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa9815b1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fa982514119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fa9ce329e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default2]:frame #3: + 0x8609 (0x7fa9d3370609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fa9d313b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank44]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank44]: pipeline_state.run_communication() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank44]: recv_activation_tensor = recv_activation() -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank44]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank44]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank44]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank44]: dist.recv( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default4]:[rank44]: return func(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank44]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank44]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc563507897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5647e0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5647e5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5647e6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc5b027fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc5b52c6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc5b5091353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc563507897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc5647e0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc5647e5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc5647e6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fc5b027fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fc5b52c6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fc5b5091353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc563507897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fc56446a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fc5b027fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7fc5b52c6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fc5b5091353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab9d4fe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fab9e7d7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fab9e7dca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fab9e7dddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fabea276e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fabef2bd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fabef088353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab9d4fe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fab9e7d7c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fab9e7dca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fab9e7dddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fabea276e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fabef2bd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fabef088353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fab9d4fe897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fab9e461119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fabea276e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fabef2bd609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fabef088353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default1]:[rank41]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default1]:[rank41]: pipeline_state.run_communication() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default1]:[rank41]: recv_activation_tensor = recv_activation() -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default1]:[rank41]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default1]:[rank41]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default1]:[rank41]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default1]:[rank41]: dist.recv( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default1]:[rank41]: return func(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default1]:[rank41]: pg.recv([tensor], group_src_rank, tag).wait() -[default1]:[rank41]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f720b4e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f720c7bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f720c7c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f720c7c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f725825be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f725d2a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f725d06d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f720b4e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f720c7bcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f720c7c1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f720c7c2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f725825be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f725d2a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f725d06d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f720b4e3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f720c446119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f725825be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f725d2a2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f725d06d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0022d65897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f002403ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0024043a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0024044dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f006fadde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0074b24609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f00748ef353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0022d65897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f002403ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0024043a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0024044dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f006fadde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f0074b24609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f00748ef353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0022d65897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f0023cc8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f006fadde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default3]:frame #3: + 0x8609 (0x7f0074b24609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f00748ef353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9209f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb921ccec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb921cd3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb921cd4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb96d76de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb9727b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb97257f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9209f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb921ccec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb921cd3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb921cd4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb96d76de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb9727b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb97257f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9209f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fb921958119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb96d76de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fb9727b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fb97257f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac7153c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fac72815c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fac7281aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fac7281bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7facbe2b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7facc32fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7facc30c6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac7153c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fac72815c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fac7281aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fac7281bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7facbe2b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7facc32fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7facc30c6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fac7153c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fac7249f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7facbe2b4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default0]:frame #3: + 0x8609 (0x7facc32fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7facc30c6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default5]:[rank37]: new_kwargs[name] = 
recv_from_pipeline_state_buffer( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default5]:[rank37]: pipeline_state.run_communication() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default5]:[rank37]: recv_activation_tensor = recv_activation() -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default5]:[rank37]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default5]:[rank37]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default5]:[rank37]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default5]:[rank37]: dist.recv( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper -[default5]:[rank37]: return func(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default5]:[rank37]: pg.recv([tensor], group_src_rank, tag).wait() -[default5]:[rank37]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f96f9da1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f96fb07ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f96fb07fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f96fb080dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f9746b19e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f974bb60609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f974b92b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f96f9da1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f96fb07ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f96fb07fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f96fb080dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f9746b19e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f974bb60609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f974b92b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f96f9da1897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f96fad04119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f9746b19e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f974bb60609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f974b92b353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c89088897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c8a361c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c8a366a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c8a367dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3cd5e00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3cdae47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3cdac12353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c89088897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c8a361c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c8a366a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c8a367dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3cd5e00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3cdae47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3cdac12353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c89088897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3c89feb119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f3cd5e00e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f3cdae47609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f3cdac12353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default4]:[rank36]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default4]:[rank36]: pipeline_state.run_communication() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default4]:[rank36]: recv_activation_tensor = recv_activation() -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default4]:[rank36]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default4]:[rank36]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default4]:[rank36]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default4]:[rank36]: dist.recv( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default4]:[rank36]: return func(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default4]:[rank36]: pg.recv([tensor], group_src_rank, tag).wait() -[default4]:[rank36]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa7f156897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffa8042fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffa80434a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffa80435dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ffacbecee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ffad0f15609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ffad0ce0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa7f156897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffa8042fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffa80434a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffa80435dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ffacbecee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ffad0f15609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ffad0ce0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffa7f156897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7ffa800b9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7ffacbecee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7ffad0f15609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7ffad0ce0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8fb4936897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8fb5c0fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8fb5c14a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8fb5c15dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f90016aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f90066f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f90064c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8fb4936897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8fb5c0fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8fb5c14a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8fb5c15dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f90016aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f90066f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f90064c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8fb4936897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f8fb5899119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f90016aee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default6]:frame #3: + 0x8609 (0x7f90066f5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f90064c0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7f63b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b80914c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b80919a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b8091adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f1bcc3b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f1bd13fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f1bd11c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600097 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7f63b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1b80914c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1b80919a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1b8091adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f1bcc3b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f1bd13fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f1bd11c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1b7f63b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f1b8059e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f1bcc3b3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f1bd13fa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f1bd11c5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward -[default0]:[rank32]: new_kwargs[name] = recv_from_pipeline_state_buffer( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer -[default0]:[rank32]: pipeline_state.run_communication() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication -[default0]:[rank32]: recv_activation_tensor = recv_activation() -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__ -[default0]:[rank32]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors -[default0]:[rank32]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors -[default0]:[rank32]: meta = self._recv_meta(from_rank=from_rank, tag=tag) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta -[default0]:[rank32]: dist.recv( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", 
line 75, in wrapper -[default0]:[rank32]: return func(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv -[default0]:[rank32]: pg.recv([tensor], group_src_rank, tag).wait() -[default0]:[rank32]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9fd210897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb9fe4e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb9fe4eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb9fe4efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fba49f88e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fba4efcf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fba4ed9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9fd210897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb9fe4e9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb9fe4eea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb9fe4efdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fba49f88e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fba4efcf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fba4ed9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb9fd210897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fb9fe173119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fba49f88e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fba4efcf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fba4ed9a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f71e0661897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f71e193ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f71e193fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f71e1940dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f722d3d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7232420609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f72321eb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f71e0661897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f71e193ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f71e193fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f71e1940dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f722d3d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7232420609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f72321eb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f71e0661897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f71e15c4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f722d3d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default7]:frame #3: + 0x8609 (0x7f7232420609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f72321eb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdcb60fa897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdcb73d3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdcb73d8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdcb73d9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fdd02e72e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fdd07eb9609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fdd07c84353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default1]:[rank57]: Traceback (most recent call last):
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank57]: trainer.train(dataloader)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank57]: output = model(**micro_batch)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]: return self._call_impl(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]: return forward_call(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank57]: sharded_logits = self.model(
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]: return self._call_impl(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]: return forward_call(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank57]: return self._call_impl(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank57]: return forward_call(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
-[default1]:[rank57]: new_kwargs[name] = recv_from_pipeline_state_buffer(
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
-[default1]:[rank57]: pipeline_state.run_communication()
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 150, in run_communication
-[default1]:[rank57]: recv_activation_tensor = recv_activation()
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 31, in __call__
-[default1]:[rank57]: return self.p2p.recv_tensors(num_tensors=1, from_rank=self.from_rank)[0]
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 353, in recv_tensors
-[default1]:[rank57]: buffers, futures = self.irecv_tensors(num_tensors=num_tensors, from_rank=from_rank, tag=tag)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 326, in irecv_tensors
-[default1]:[rank57]: meta = self._recv_meta(from_rank=from_rank, tag=tag)
-[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 269, in _recv_meta
-[default1]:[rank57]: dist.recv(
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default1]:[rank57]: return func(*args, **kwargs)
-[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1932, in recv
-[default1]:[rank57]: pg.recv([tensor], group_src_rank, tag).wait()
-[default1]:[rank57]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default6]:[rank54]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600065 milliseconds before timing out.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank51]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600094 milliseconds before timing out.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank48]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank52]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600090 milliseconds before timing out.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600074 milliseconds before timing out.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600064 milliseconds before timing out.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600070 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600067 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600050 milliseconds before timing out.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600076 milliseconds before timing out.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600003 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1b12d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb1b25aec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb1b25b3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb1b25b4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fb1fe04de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fb203094609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fb202e5f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb1b12d5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fb1b2238119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fb1fe04de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fb203094609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fb202e5f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 0] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05ffdad897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0601086c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f060108ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f060108cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f064cb25e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f0651b6c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f0651937353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=32768, NumelOut=32768, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05ffdad897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0601086c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f060108ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f060108cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f064cb25e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f0651b6c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f0651937353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f05ffdad897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f0600d10119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f064cb25e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default4]:frame #3: + 0x8609 (0x7f0651b6c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f0651937353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdcd42a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdcd557fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdcd5584a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdcd5585dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fdd2101ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fdd26065609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fdd25e30353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdcd42a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdcd557fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdcd5584a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdcd5585dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fdd2101ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fdd26065609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fdd25e30353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdcd42a6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fdcd5209119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fdd2101ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fdd26065609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fdd25e30353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc7c7f6f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc7c9248c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc7c924da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc7c924edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc814ce7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc819d2e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc819af9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc7c7f6f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc7c9248c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc7c924da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc7c924edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fc814ce7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fc819d2e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fc819af9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc7c7f6f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fc7c8ed2119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fc814ce7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default5]:frame #3: + 0x8609 (0x7fc819d2e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fc819af9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f69dcce3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f69ddfbcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f69ddfc1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f69ddfc2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f6a29a5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f6a2eaa2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6a2e86d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f69dcce3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f69ddfbcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f69ddfc1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f69ddfc2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f6a29a5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f6a2eaa2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f6a2e86d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f69dcce3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f69ddc46119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f6a29a5be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f6a2eaa2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f6a2e86d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f020c376897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f020d64fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f020d654a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f020d655dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f02590eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f025e135609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f025df00353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f020c376897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f020d64fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f020d654a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f020d655dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f02590eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f025e135609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f025df00353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f020c376897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f020d2d9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f02590eee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f025e135609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f025df00353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b235fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5b248d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5b248d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5b248dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5b70373e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5b753ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5b75185353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600044 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b235fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5b248d4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5b248d9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5b248dadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f5b70373e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f5b753ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f5b75185353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5b235fb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5b2455e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f5b70373e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f5b753ba609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f5b75185353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4af9e71897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4afb14ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4afb14fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4afb150dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4b46be9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4b4bc30609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f4b4b9fb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600071 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4af9e71897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4afb14ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4afb14fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4afb150dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4b46be9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f4b4bc30609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f4b4b9fb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4af9e71897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f4afadd4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f4b46be9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) 
-[default1]:frame #3: + 0x8609 (0x7f4b4bc30609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f4b4b9fb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1537] [PG 4 Rank 1] Timeout at NCCL work: 15, last enqueued NCCL work: 16, last completed NCCL work: 14. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:1414] [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff1a9655897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff1aa92ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff1aa933a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff1aa934dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7ff1f63cde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7ff1fb414609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7ff1fb1df353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 4 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=15, OpType=SEND, NumelIn=4194304, NumelOut=4194304, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. 
[... rank 54 then prints the same checkTimeout and ncclCommWatchdog stack traces as above ...]
-W0703 04:02:14.069000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 908999 closing signal SIGTERM
-W0703 04:02:14.070000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 909000 closing signal SIGTERM
-W0703 04:02:14.070000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 909002 closing signal SIGTERM
-W0703 04:02:14.070000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 909003 closing signal SIGTERM
-W0703 04:02:14.070000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 909004 closing signal SIGTERM
-W0703 04:02:14.070000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 909005 closing signal SIGTERM
-W0703 04:02:14.070000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 909006 closing signal SIGTERM
-E0703 04:02:15.872000 140490932283200 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 909001) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-  <NO_OTHER_FAILURES>
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time: 2024-07-03_04:02:14, host: ip-26-0-171-88.ec2.internal, rank: 58 (local_rank: 2), exitcode: -6 (pid: 909001), traceback: Signal 6 (SIGABRT) received by PID 909001
-============================================================
-srun: error: ip-26-0-171-88: task 6: Exited with exit code 1
-W0703 04:02:19.048000 139800098445120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1171225 closing signal SIGTERM
-W0703 04:02:19.048000 139800098445120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1171226 closing signal SIGTERM
-W0703 04:02:19.048000 139800098445120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1171228 closing signal SIGTERM
-W0703 04:02:19.048000 139800098445120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1171229 closing signal SIGTERM
-W0703 04:02:19.048000 139800098445120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1171230 closing signal SIGTERM
-W0703 04:02:19.048000 139800098445120 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1171232 closing signal SIGTERM
-W0703 04:02:19.069000 140651166279488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 895325 closing signal SIGTERM
-W0703 04:02:19.069000 140651166279488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 895327 closing signal SIGTERM
-W0703 04:02:19.069000 140651166279488 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 895328 closing signal SIGTERM
-W0703 04:02:19.076000 140659577317184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1446764 closing signal SIGTERM
-W0703 04:02:19.076000 140659577317184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1446765 closing signal SIGTERM
-W0703 04:02:19.076000 140659577317184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1446768 closing signal SIGTERM
-W0703 04:02:19.076000 140659577317184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1446770 closing signal SIGTERM
-W0703 04:02:19.093000 139961753220928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3791311 closing signal SIGTERM
-W0703 04:02:19.093000 139961753220928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3791315 closing signal SIGTERM
-W0703 04:02:19.146000 139907187222336 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3920591 closing signal SIGTERM
-E0703 04:02:19.177000 140400945293120 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 691273) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 04:02:19.204000 140394431031104 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 20208) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
[... the same torchrun traceback as above, ending in torch.distributed.elastic.multiprocessing.errors.ChildFailedError, is printed again ...]
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 17 (local_rank: 1), exitcode: -6 (pid: 691274), traceback: Signal 6 (SIGABRT) received by PID 691274
-[2]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 18 (local_rank: 2), exitcode: -6 (pid: 691275), traceback: Signal 6 (SIGABRT) received by PID 691275
-[3]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 19 (local_rank: 3), exitcode: -6 (pid: 691276), traceback: Signal 6 (SIGABRT) received by PID 691276
-[4]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 20 (local_rank: 4), exitcode: -6 (pid: 691277), traceback: Signal 6 (SIGABRT) received by PID 691277
-[5]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 21 (local_rank: 5), exitcode: -6 (pid: 691278), traceback: Signal 6 (SIGABRT) received by PID 691278
-[6]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 22 (local_rank: 6), exitcode: -6 (pid: 691279), traceback: Signal 6 (SIGABRT) received by PID 691279
-[7]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 23 (local_rank: 7), exitcode: -6 (pid: 691280), traceback: Signal 6 (SIGABRT) received by PID 691280
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time: 2024-07-03_04:02:19, host: ip-26-0-161-138.ec2.internal, rank: 16 (local_rank: 0), exitcode: -6 (pid: 691273), traceback: Signal 6 (SIGABRT) received by PID 691273
-============================================================
[... the same torchrun traceback is printed once more ...]
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 1 (local_rank: 1), exitcode: -6 (pid: 20209), traceback: Signal 6 (SIGABRT) received by PID 20209
-[2]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 2 (local_rank: 2), exitcode: -6 (pid: 20210), traceback: Signal 6 (SIGABRT) received by PID 20210
-[3]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 3 (local_rank: 3), exitcode: -6 (pid: 20211), traceback: Signal 6 (SIGABRT) received by PID 20211
-[4]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 4 (local_rank: 4), exitcode: -6 (pid: 20212), traceback: Signal 6 (SIGABRT) received by PID 20212
-[5]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 5 (local_rank: 5), exitcode: -6 (pid: 20213), traceback: Signal 6 (SIGABRT) received by PID 20213
-[6]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 6 (local_rank: 6), exitcode: -6 (pid: 20214), traceback: Signal 6 (SIGABRT) received by PID 20214
-[7]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 7 (local_rank: 7), exitcode: -6 (pid: 20215), traceback: Signal 6 (SIGABRT) received by PID 20215
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time: 2024-07-03_04:02:19, host: ip-26-0-160-225.ec2.internal, rank: 0 (local_rank: 0), exitcode: -6 (pid: 20208), traceback: Signal 6 (SIGABRT) received by PID 20208
-============================================================
-E0703 04:02:19.621000 139907187222336 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3920586) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-W0703 04:02:19.639000 139907187222336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3920517_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:02:19.668000 139907187222336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3920517_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:02:19.695000 139907187222336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3920517_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
[... the same torchrun traceback is printed again ...]
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time: 2024-07-03_04:02:19, host: ip-26-0-171-62.ec2.internal, rank: 49 (local_rank: 1), exitcode: -6 (pid: 3920587), traceback: Signal 6 (SIGABRT) received by PID 3920587
-[2]: time: 2024-07-03_04:02:19, host: ip-26-0-171-62.ec2.internal, rank: 50 (local_rank: 2), exitcode: -6 (pid: 3920588), traceback: Signal 6 (SIGABRT) received by PID 3920588
-[3]: time: 2024-07-03_04:02:19, host: ip-26-0-171-62.ec2.internal, rank: 51 (local_rank: 3), exitcode: -6 (pid: 3920589), traceback: Signal 6 (SIGABRT) received by PID 3920589
-[4]: time: 2024-07-03_04:02:19, host: ip-26-0-171-62.ec2.internal, rank: 52 (local_rank: 4), exitcode: -6 (pid: 3920590), traceback: Signal 6 (SIGABRT) received by PID 3920590
-[5]: time: 2024-07-03_04:02:19, host: ip-26-0-171-62.ec2.internal, rank: 54 (local_rank: 6), exitcode: -6 (pid: 3920592), traceback: Signal 6 (SIGABRT) received by PID 3920592
-[6]: time: 2024-07-03_04:02:19, host: ip-26-0-171-62.ec2.internal, rank: 55 (local_rank: 7), exitcode: -6 (pid: 3920593), traceback: Signal 6 (SIGABRT) received by PID 3920593
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time: 2024-07-03_04:02:19, host: ip-26-0-171-62.ec2.internal, rank: 48 (local_rank: 0), exitcode: -6 (pid: 3920586), traceback: Signal 6 (SIGABRT) received by PID 3920586
-============================================================ -E0703 04:02:19.775000 139961753220928 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3791310) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:02:19.792000 139961753220928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3791242_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:19.821000 139961753220928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3791242_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:19.843000 139961753220928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3791242_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:02:19 - host : ip-26-0-171-102.ec2.internal - rank : 42 (local_rank: 2) - exitcode : -6 (pid: 3791313) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3791313 -[2]: - time : 2024-07-03_04:02:19 - host : ip-26-0-171-102.ec2.internal - rank : 43 (local_rank: 3) - exitcode : -6 (pid: 3791314) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3791314 -[3]: - time : 2024-07-03_04:02:19 - host : ip-26-0-171-102.ec2.internal - rank : 45 (local_rank: 5) - exitcode : -6 (pid: 3791316) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3791316 -[4]: - time : 2024-07-03_04:02:19 - host : ip-26-0-171-102.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 3791317) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3791317 -[5]: - time : 2024-07-03_04:02:19 - host : ip-26-0-171-102.ec2.internal - rank : 47 (local_rank: 7) - exitcode : -6 (pid: 3791318) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3791318 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:02:19 - host : ip-26-0-171-102.ec2.internal - rank : 40 (local_rank: 
0) - exitcode : -6 (pid: 3791310) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3791310 -============================================================ -E0703 04:02:20.083000 140659577317184 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 1446766) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:02:20.099000 140659577317184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1446695_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:20.128000 140659577317184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1446695_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:20.147000 140659577317184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1446695_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-153.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 1446767) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1446767 -[2]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-153.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 1446769) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1446769 -[3]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-153.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 1446771) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1446771 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-153.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 1446766) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1446766 -============================================================ -E0703 04:02:20.182000 140651166279488 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 
895324) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:02:20.198000 140651166279488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_895255_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:20.226000 140651166279488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_895255_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:20.246000 140651166279488 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-103.ec2.internal_895255_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 895326) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 895326 -[2]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 895329) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 895329 -[3]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 895330) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 895330 -[4]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 895331) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 895331 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 895324) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 895324 -============================================================ -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -E0703 04:02:20.568000 139800098445120 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) 
local_rank: 2 (pid: 1171227) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 04:02:20.584000 139800098445120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1171156_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:20.620000 139800098445120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1171156_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:02:20.631000 139800098445120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1171156_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-78.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 1171231) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1171231 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:02:19 - host : ip-26-0-161-78.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 1171227) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1171227 -============================================================ -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 4: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
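Note on the failure above: every rank aborts with SIGABRT and torchrun prints one ChildFailedError report per node, after which the bench.slurm scripts in this tree grep log.out for a few error signatures to decide whether status.txt should read "oom", "timeout", or "fail". Below is a minimal, hedged Python sketch of that post-mortem step, not part of the benchmark tooling: the signature strings are copied from the scripts in this diff, the per-rank record layout is assumed from the torchrun report printed above, and the names classify_log and RANK_RECORD are illustrative.

    import re
    import sys
    from pathlib import Path

    # Signature strings mirrored from the grep checks in the bench.slurm
    # scripts in this diff; first match wins, like their if/elif chain.
    SIGNATURES = [
        ("oom", "OutOfMemoryError"),
        ("oom", "CUDA error: an illegal memory access"),
        ("timeout", "Timeout at NCCL"),
    ]

    # Per-rank failure record as printed in torchrun's ChildFailedError
    # report above ("host : ...", "rank : N (local_rank: M)", "exitcode : -6").
    # The exact whitespace is assumed, hence the permissive \s*.
    RANK_RECORD = re.compile(
        r"host\s*:\s*(?P<host>\S+)\s*"
        r"rank\s*:\s*(?P<rank>\d+)\s*\(local_rank:\s*\d+\)\s*"
        r"exitcode\s*:\s*(?P<code>-?\d+)"
    )

    def classify_log(text: str) -> str:
        """Return the status bench.slurm would write when srun exits non-zero."""
        for status, needle in SIGNATURES:
            if needle in text:
                return status
        return "fail"

    if __name__ == "__main__":
        text = Path(sys.argv[1]).read_text(errors="replace")
        print("status:", classify_log(text))
        # List the failing ranks reported by torchrun, if any.
        for m in RANK_RECORD.finditer(text):
            print(f"rank {m.group('rank')} on {m.group('host')} exited with {m.group('code')}")

Like the if/elif chain in bench.slurm, this maps "CUDA error: an illegal memory access" to "oom" and falls back to "fail"; it only covers logs from runs whose srun exited non-zero, since successful runs are marked "completed" directly.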
diff --git a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt b/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-16_pp-2_mbz-8/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/bench.slurm deleted file mode 100644 index 69c317fa852ef325a5eb7c8ad3291e0f3b659026..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doesn't update the status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$?
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/config.yaml deleted file mode 100644 index 0793a1d2de6b9db6b5a27f6b81cbc6e0f2479c0a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 512 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out deleted file mode 100644 index b34440d1e68c00419ecbb3ec0fbb6220c9c348c6..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/log.out +++ /dev/null @@ -1,2234 +0,0 @@ -======================== -START TIME: Wed Jul 3 10:20:34 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 10:20:37.115000 139815041251136 torch/distributed/run.py:757] -W0703 10:20:37.115000 139815041251136 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.115000 139815041251136 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:20:37.115000 139815041251136 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.114000 140147849172800 torch/distributed/run.py:757] -W0703 10:20:37.114000 140147849172800 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.114000 140147849172800 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:20:37.114000 140147849172800 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.113000 139830256518976 torch/distributed/run.py:757] -W0703 10:20:37.113000 139830256518976 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.113000 139830256518976 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:20:37.113000 139830256518976 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.120000 140282513299264 torch/distributed/run.py:757] -W0703 10:20:37.120000 140282513299264 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.120000 140282513299264 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:20:37.120000 140282513299264 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.123000 140716396173120 torch/distributed/run.py:757] -W0703 10:20:37.123000 140716396173120 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.123000 140716396173120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 10:20:37.123000 140716396173120 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.130000 140233945896768 torch/distributed/run.py:757] -W0703 10:20:37.130000 140233945896768 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.130000 140233945896768 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:20:37.130000 140233945896768 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.152000 139884516067136 torch/distributed/run.py:757] -W0703 10:20:37.152000 139884516067136 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.152000 139884516067136 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:20:37.152000 139884516067136 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.160000 139944327452480 torch/distributed/run.py:757] -W0703 10:20:37.160000 139944327452480 torch/distributed/run.py:757] ***************************************** -W0703 10:20:37.160000 139944327452480 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:20:37.160000 139944327452480 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 10:20:57 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 10:20:57 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=1, -[default0]:07/03/2024 10:20:57 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=512, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1')), -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 10:20:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default2]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=25|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=27|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=24|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=0|ip-26-0-169-132]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=16|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=7|ip-26-0-169-132]: No checkpoint path provided. -[default5]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=19|ip-26-0-172-57]: No checkpoint path provided. 
-[default5]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=5|ip-26-0-169-132]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=1|ip-26-0-169-132]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=4|ip-26-0-169-132]: No checkpoint path provided. -[default2]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=2|ip-26-0-169-132]: No checkpoint path provided. -[default3]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=3|ip-26-0-169-132]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=6|ip-26-0-169-132]: No checkpoint path provided. -[default7]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 10:21:15 [INFO|DP=1|PP=0|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: No checkpoint path provided. 
-[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. 
-[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: No checkpoint path provided. 
-[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: No checkpoint path provided. -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: No checkpoint path provided. 
-[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 10:21:15 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 10:21:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 10:21:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 10:21:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 10:21:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 10:21:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 10:21:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 10:21:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 10:21:18 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 10:21:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 10:21:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 10:21:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 10:21:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 10:21:19.643437 | mbs: 1 | grad_accum: 512 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 10:21:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 10:21:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default4]:07/03/2024 10:21:19 [WARNING|DP=1|PP=0|TP=4|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 10:21:19 [WARNING|DP=0|PP=0|TP=19|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 10:21:19 [WARNING|DP=0|PP=0|TP=17|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 10:21:19 [WARNING|DP=0|PP=0|TP=22|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. 
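The `[Start training]` and `[ZeRO sharding]` lines above fix the run's effective batch size and optimizer-state split: with TP ranks 0-31 and a single PP stage visible in this log, two data-parallel replicas remain on the 64 GPUs, and ZeRO-1 gives each of them half of the 34.8M locally sharded parameters' optimizer states. A minimal back-of-the-envelope sketch of that arithmetic (plain Python, values copied from the log; the variable names are ours, not nanotron's):

```python
# Reproduce the batch-size and ZeRO-1 numbers reported in the log above.
# All inputs are copied from the log lines; nothing here calls nanotron itself.
micro_batch_size = 1        # "mbs: 1"
grad_accum_steps = 512      # "grad_accum: 512"
dp = 2                      # DP=0 and DP=1 appear in the log (64 GPUs, tp=32, pp=1)
sequence_length = 4096      # "sequence_length: 4096"

global_batch_size = micro_batch_size * grad_accum_steps * dp
tokens_per_step = global_batch_size * sequence_length

params_per_tp_rank = 34.8e6                     # "Local number of parameters: 34.8M"
optim_share_per_dp_rank = params_per_tp_rank / dp  # ZeRO-1 shards optimizer states over DP

print(global_batch_size)        # 1024, matching "global_batch_size: 1024"
print(tokens_per_step)          # 4194304 tokens consumed per optimizer step
print(optim_share_per_dp_rank)  # 17.4e6, matching "DP Rank 0/1 has 17.4M (50.00%)"
```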
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank49]: trainer.train(dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
-[default1]:[rank49]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
-[default1]:[rank49]: grad_accumulator.backward(sum(activations))
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
-[default1]:[rank49]: result = loss.backward()
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
-[default1]:[rank49]: torch.autograd.backward(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
-[default1]:[rank49]: _engine_run_backward(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
-[default1]:[rank49]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
-[default1]:[rank49]: return user_fn(self, *args)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 821, in backward
-[default1]:[rank49]: dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 643, in _layer_norm_bwd
-[default1]:[rank49]: _layer_norm_bwd_kernel[grid](
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
-[default1]:[rank49]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
-[default1]:[rank49]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
-[default1]:[rank49]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
-[default1]:[rank49]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
-[default1]:[rank49]: fn()
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
-[default1]:[rank49]: self.fn.run(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
-[default1]:[rank49]: return self.fn.run(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
-[default1]:[rank49]: return self.fn.run(*args, **kwargs)
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
-[default1]:[rank49]: return self.fn.run(*args, **kwargs)
-[default1]:[rank49]: [Previous line repeated 2 more times]
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
-[default1]:[rank49]: self.cache[device][key] = compile(
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 202, in compile
-[default1]:[rank49]: return CompiledKernel(so_path, metadata_group.get(metadata_filename))
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 230, in __init__
-[default1]:[rank49]: self.asm = {
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 231, in <dictcomp>
-[default1]:[rank49]: file.suffix[1:]: file.read_bytes() if file.suffix[1:] == driver.binary_ext else file.read_text()
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/pathlib.py", line 1134, in read_text
-[default1]:[rank49]: with self.open(mode='r', encoding=encoding, errors=errors) as f:
-[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/pathlib.py", line 1119, in open
-[default1]:[rank49]: return self._accessor.open(self, mode, buffering, encoding, errors,
-[default1]:[rank49]: FileNotFoundError: [Errno 2] No such file or directory: '/admin/home/ferdinand_mom/.triton/cache/277d2c21bde4aaafc772feeb311c2fa1/_layer_norm_bwd_kernel.cubin.tmp.pid_1912744_288389'
-[default1]:07/03/2024 10:21:30 [WARNING|DP=0|PP=0|TP=25|ip-26-0-168-238]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default1]:07/03/2024 10:21:30 [WARNING|DP=0|PP=0|TP=25|ip-26-0-168-238]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default1]:Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default1]:Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions
-[default6]: warnings.warn(
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default7]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]:07/03/2024 10:22:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 365.65MiB. Peak allocated 1586.86MiB. Peak reserved: 1670.00MiB -[default4]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600021 milliseconds before timing out.
(Every other rank shown in this excerpt, covering global ranks 32-63 / PG 2 ranks 0-31, reports the same watchdog timeout for SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000, with elapsed times between 600003 and 600099 milliseconds.)
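Each of these entries reflects the 600000 ms (10 minute) per-collective timeout enforced by the NCCL watchdog for this process group. If a collective is legitimately expected to take longer, rather than being genuinely hung, one common mitigation is to pass a larger timeout when the process group is created. The sketch below is illustrative only; the backend and the 30-minute value are assumptions, not the configuration used in this run.

from datetime import timedelta
import torch.distributed as dist

# Sketch: raise the per-collective timeout from the 10-minute default.
# Rendezvous details (MASTER_ADDR, rank, world size) are assumed to be
# provided by the launcher via environment variables.
dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))

A longer timeout only hides the symptom if one stage has actually stalled; in that case the stuck rank still has to be identified from the traces that follow.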
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06eb1a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f06ec47cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f06ec481a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f06ec482dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f0737f1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f073cf62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f073cd2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 2 Rank 4] Process group watchdog thread terminated with exception: [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600083 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06eb1a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f06ec47cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f06ec481a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f06ec482dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f0737f1be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f073cf62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f073cd2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f06eb1a3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f06ec106119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f0737f1be95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f073cf62609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f073cd2d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 7] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:577] [Rank 7] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:583] [Rank 7] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0139d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe014cb0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe014cb5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe014cb6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe06074fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe065796609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe065561353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0139d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe014cb0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe014cb5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe014cb6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fe06074fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fe065796609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fe065561353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe0139d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fe01493a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fe06074fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fe065796609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fe065561353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 2] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4dff921897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4e00bfac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4e00bffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4e00c00dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4e4c699e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4e516e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4e514ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600054 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4dff921897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4e00bfac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4e00bffa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4e00c00dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f4e4c699e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f4e516e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f4e514ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4dff921897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f4e00884119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f4e4c699e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f4e516e0609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f4e514ab353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 1] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f4b82e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9f4cb07c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9f4cb0ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9f4cb0ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9f985a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9f9d5ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9f9d3b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f4b82e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9f4cb07c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9f4cb0ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9f4cb0ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9f985a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9f9d5ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9f9d3b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9f4b82e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f9f4c791119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9f985a6e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f9f9d5ed609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f9f9d3b8353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 6] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:577] [Rank 6] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:583] [Rank 6] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 5] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:577] [Rank 5] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:583] [Rank 5] To avoid data inconsistency, we are taking the entire process down. 
-[default5]:[rank37]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0f3fc30897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0f40f09c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb53d8a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb53eb7ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb53eb83a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb53eb84dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb58a61de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb58f664609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb58f42f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0f40f0ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0f40f0fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #4: + 0xd3e95 (0x7f0f8c9a8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f0f919ef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f0f917ba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. -[default6]: what(): [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb53d8a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb53eb7ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb53eb83a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0f3fc30897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f0f40f09c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f0f40f0ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb53eb84dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb58a61de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f0f40f0fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #5: + 0x8609 (0x7fb58f664609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb58f42f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #4: + 0xd3e95 (0x7f0f8c9a8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]: -[default5]:frame #5: + 0x8609 (0x7f0f919ef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #6: clone + 0x43 (0x7f0f917ba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb53d8a5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]: -[default6]:frame #1: + 0xe32119 (0x7fb53e808119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb58a61de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fb58f664609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f0f3fc30897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #4: clone + 0x43 (0x7fb58f42f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #1: + 0xe32119 (0x7f0f40b93119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f0f8c9a8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f0f919ef609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]: -[default5]:frame #4: clone + 0x43 (0x7f0f917ba353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 0] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb68f2a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb690581c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb690586a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb690587dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fb6dc020e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fb6e1067609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fb6e0e32353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 2 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb68f2a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb690581c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb690586a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb690587dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fb6dc020e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fb6e1067609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fb6e0e32353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb68f2a8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fb69020b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fb6dc020e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fb6e1067609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fb6e0e32353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 3] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fae83d20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fae84ff9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fae84ffea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fae84fffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7faed0a98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7faed5adf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7faed58aa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600010 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fae83d20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fae84ff9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fae84ffea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fae84fffdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7faed0a98e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7faed5adf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7faed58aa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fae83d20897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fae84c83119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7faed0a98e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7faed5adf609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7faed58aa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 29] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:577] [Rank 29] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:583] [Rank 29] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 29] Process group watchdog thread terminated with exception: [Rank 29] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4401b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd44148dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd441492a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd441493dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fd48cf2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd491f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fd491d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 29] Process group watchdog thread terminated with exception: [Rank 29] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4401b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd44148dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd441492a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd441493dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fd48cf2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fd491f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fd491d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd4401b4897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fd441117119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fd48cf2ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fd491f73609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fd491d3e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 25] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:577] [Rank 25] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:583] [Rank 25] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 25] Process group watchdog thread terminated with exception: [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ed76a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3ed8979c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3ed897ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3ed897fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3f24418e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3f2945f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3f2922a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 25] Process group watchdog thread terminated with exception: [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600006 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ed76a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3ed8979c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3ed897ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3ed897fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f3f24418e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f3f2945f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f3f2922a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3ed76a0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f3ed8603119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f3f24418e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f3f2945f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f3f2922a353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 31] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 24] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 27] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:577] [Rank 27] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:577] [Rank 31] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:577] [Rank 24] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:583] [Rank 27] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:583] [Rank 31] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:583] [Rank 24] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 31] Process group watchdog thread terminated with exception: [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 27] Process group watchdog thread terminated with exception: [Rank 27] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600038 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 24] Process group watchdog thread terminated with exception: [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. 
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 26] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:577] [Rank 26] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:583] [Rank 26] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 26] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600032 milliseconds before timing out.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 30] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:577] [Rank 30] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:583] [Rank 30] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 30] Process group watchdog thread terminated with exception: [Rank 30] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 28] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:577] [Rank 28] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:583] [Rank 28] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 28] Process group watchdog thread terminated with exception: [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600027 milliseconds before timing out.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 12] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:577] [Rank 12] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:583] [Rank 12] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank44]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 12] Process group watchdog thread terminated with exception: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600080 milliseconds before timing out.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 10] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:577] [Rank 10] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:583] [Rank 10] To avoid data inconsistency, we are taking the entire process down.
-[default2]:[rank42]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 10] Process group watchdog thread terminated with exception: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600021 milliseconds before timing out.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 9] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:577] [Rank 9] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:583] [Rank 9] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 9] Process group watchdog thread terminated with exception: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600048 milliseconds before timing out.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 15] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:577] [Rank 15] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:583] [Rank 15] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank47]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 15] Process group watchdog thread terminated with exception: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600048 milliseconds before timing out.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 14] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:577] [Rank 14] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:583] [Rank 14] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank46]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600037 milliseconds before timing out.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 8] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:577] [Rank 8] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:583] [Rank 8] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank40]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 8] Process group watchdog thread terminated with exception: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600062 milliseconds before timing out.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 11] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:577] [Rank 11] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:583] [Rank 11] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 13] Timeout at NCCL work: 154, last enqueued NCCL work: 154, last completed NCCL work: 153.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:577] [Rank 13] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:583] [Rank 13] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank45]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=154, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 600018 milliseconds before timing out.
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe5b8d94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe5ba06dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe5ba072a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe5ba073dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe605b0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe60ab53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe60a91e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe5b8d94897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fe5b9cf7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fe605b0ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fe60ab53609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fe60a91e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -E0703 10:31:39.273000 140716396173120 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 982689) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED 
------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 982690) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982690 -[2]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 982691) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982691 -[3]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 982692) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982692 -[4]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 982693) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982693 -[5]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 982694) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982694 -[6]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 982695) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982695 -[7]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 982696) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982696 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:31:39 - host : ip-26-0-172-73.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 982689) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 982689 -============================================================ -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 10:31:44.048000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1912743 closing signal SIGTERM -W0703 10:31:44.049000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1912744 closing signal SIGTERM -W0703 10:31:44.049000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1912745 closing signal SIGTERM -W0703 10:31:44.049000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1912746 closing signal SIGTERM -W0703 10:31:44.049000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1912747 closing signal SIGTERM -W0703 10:31:44.049000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1912748 closing signal SIGTERM -W0703 10:31:44.049000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1912750 closing signal SIGTERM -E0703 10:31:44.296000 140233945896768 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 2483342) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 33 (local_rank: 1) - exitcode : -6 (pid: 2483343) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483343 -[2]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 34 (local_rank: 2) - exitcode : -6 (pid: 2483344) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483344 -[3]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 35 (local_rank: 3) - exitcode : -6 (pid: 2483345) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483345 -[4]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 36 (local_rank: 4) - exitcode : -6 (pid: 2483346) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483346 -[5]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 37 (local_rank: 5) - exitcode : -6 (pid: 2483347) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483347 -[6]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 38 (local_rank: 6) - exitcode : -6 (pid: 2483348) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483348 -[7]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 39 (local_rank: 7) - exitcode : -6 (pid: 2483349) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483349 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-132.ec2.internal - rank : 32 (local_rank: 0) - exitcode : -6 (pid: 2483342) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 2483342 -============================================================ -srun: error: ip-26-0-169-132: task 5: Exited with exit code 1 -E0703 10:31:45.511000 139830256518976 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 6 (pid: 1912749) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:31:44 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : -6 (pid: 1912749) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1912749 -============================================================ -srun: error: ip-26-0-169-86: task 4: Exited with exit code 1 -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101940, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 26] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 26] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff659153897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff65a42cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff65a431a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff65a432dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff6a5ecbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff6aaf12609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff6aacdd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff659153897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff65a42cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff65a431a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff65a432dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7ff6a5ecbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7ff6aaf12609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7ff6aacdd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff659153897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7ff65a0b6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7ff6a5ecbe95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7ff6aaf12609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7ff6aacdd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101843, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 603177 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101844, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=262144, Timeout(ms)=600000) ran for 603178 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101844, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=262144, Timeout(ms)=600000) ran for 603177 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe7a42d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe7a55afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe7a55b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe7a55b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe7f104ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe7f6095609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe7f5e60353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600048 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe7a42d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe7a55afc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe7a55b4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe7a55b5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe7f104ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe7f6095609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe7f5e60353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe7a42d6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fe7a5239119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fe7f104ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fe7f6095609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fe7f5e60353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101843, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 603178 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff27d62f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff27e908c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff27e90da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff27e90edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff2ca3a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff2cf3ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff2cf1b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff27d62f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff27e908c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff27e90da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff27e90edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff2ca3a7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff2cf3ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff2cf1b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff27d62f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7ff27e592119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7ff2ca3a7e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7ff2cf3ee609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7ff2cf1b9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101863, OpType=_ALLGATHER_BASE, NumelIn=262144, NumelOut=8388608, Timeout(ms)=600000) ran for 603157 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e5b20b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e5c4e4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e5c4e9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e5c4eadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f2ea7f83e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2eacfca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2eacd95353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e5b20b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2e5c4e4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2e5c4e9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2e5c4eadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f2ea7f83e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f2eacfca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f2eacd95353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2e5b20b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f2e5c16e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f2ea7f83e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f2eacfca609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f2eacd95353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101844, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=262144, Timeout(ms)=600000) ran for 603177 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b5c5ff897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4b5d8d8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4b5d8dda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4b5d8dedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4ba9377e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f4bae3be609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f4bae189353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b5c5ff897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4b5d8d8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4b5d8dda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4b5d8dedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f4ba9377e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f4bae3be609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f4bae189353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b5c5ff897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f4b5d562119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f4ba9377e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f4bae3be609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f4bae189353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=101844, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=262144, Timeout(ms)=600000) ran for 603177 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd0470ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd048384c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd048389a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd04838adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd093e23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd098e6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd098c35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd0470ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd048384c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd048389a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd04838adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fd093e23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fd098e6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fd098c35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd0470ab897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fd04800e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fd093e23e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fd098e6a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fd098c35353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f707d7d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f707eab0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f707eab5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f707eab6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f70ca54fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f70cf596609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f70cf361353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600039 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f707d7d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f707eab0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f707eab5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f707eab6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f70ca54fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f70cf596609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f70cf361353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f707d7d7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f707e73a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f70ca54fe95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f70cf596609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f70cf361353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 3 Rank 0] Timeout at NCCL work: 147, last enqueued NCCL work: 147, last completed NCCL work: 146. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff564ba8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff565e81c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff565e86a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff565e87dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff5b1920e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff5b6967609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff5b6732353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 3 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=147, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff564ba8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff565e81c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff565e86a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff565e87dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7ff5b1920e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7ff5b6967609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7ff5b6732353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff564ba8897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7ff565b0b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7ff5b1920e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7ff5b6967609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7ff5b6732353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -W0703 10:33:04.215000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1939548 closing signal SIGTERM -W0703 10:33:04.215000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1939549 closing signal SIGTERM -W0703 10:33:04.215000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1939551 closing signal SIGTERM -W0703 10:33:04.215000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1939552 closing signal SIGTERM -W0703 10:33:04.215000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1939553 closing signal SIGTERM -W0703 10:33:04.215000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1939554 closing signal SIGTERM -W0703 10:33:04.216000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1939555 closing signal SIGTERM -E0703 10:33:06.945000 139944327452480 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 1939550) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:33:04 - host : ip-26-0-168-238.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 1939550) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1939550 -============================================================ -srun: error: ip-26-0-168-238: task 3: Exited with exit code 1 -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1316] [PG 0 Rank 49] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=0 -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:1153] [PG 0 Rank 49] ProcessGroupNCCL preparing to dump debug info. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 23] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 23] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 0] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 0] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank0]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 0] [PG 2 Rank 0] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 22] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 22] ProcessGroupNCCL preparing to dump debug info. 
-[default6]:[rank22]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 22] [PG 2 Rank 22] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 20] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=114 -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 20] ProcessGroupNCCL preparing to dump debug info. -[default4]:[rank20]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 20] [PG 2 Rank 20] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 114 -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 21] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=97 -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 21] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 16] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 16] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank21]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 21] [PG 2 Rank 21] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 97 -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 19] Heartbeat monitor timed out! Process will be terminated after dumping debug info. 
workMetaList_.size()=101 -[default0]:[rank16]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 16] [PG 2 Rank 16] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 18] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=96 -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 19] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 18] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank19]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 19] [PG 2 Rank 19] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 101 -[default2]:[rank18]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 18] [PG 2 Rank 18] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 96 -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 11] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 11] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank11]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 11] [PG 2 Rank 11] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 15] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 15] ProcessGroupNCCL preparing to dump debug info. -[default7]:[rank15]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 15] [PG 2 Rank 15] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 9] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=78 -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 9] ProcessGroupNCCL preparing to dump debug info. -[default1]:[rank9]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 9] [PG 2 Rank 9] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, plea[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 17] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=104 -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 17] ProcessGroupNCCL preparing to dump debug info. -[default1]:[rank17]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 17] [PG 2 Rank 17] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise attempt to debug the hang. workMetaList_.size() = 78 -se, please attempt to debug the hang. workMetaList_.size() = 104 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 2] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=129 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 2] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank2]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 2] [PG 2 Rank 2] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 129 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 1] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=110 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 1] ProcessGroupNCCL preparing to dump debug info. -[default1]:[rank1]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 1] [PG 2 Rank 1] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 110 -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 10] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=85 -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 10] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank10]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 10] [PG 2 Rank 10] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 85 -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 14] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 14] ProcessGroupNCCL preparing to dump debug info. -[default6]:[rank14]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 14] [PG 2 Rank 14] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 4] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 4] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 3] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=97 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 7] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default4]:[rank4]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 4] [PG 2 Rank 4] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 7] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 3] ProcessGroupNCCL preparing to dump debug info. -[default7]:[rank7]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 7] [PG 2 Rank 7] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default3]:[rank3]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 3] [PG 2 Rank 3] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 97 -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 12] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=101 -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 12] ProcessGroupNCCL preparing to dump debug info. -[default4]:[rank12]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 12] [PG 2 Rank 12] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 101 -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 8] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=104 -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 8] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank8]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 8] [PG 2 Rank 8] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 104 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 6] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 6] ProcessGroupNCCL preparing to dump debug info. -[default6]:[rank6]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 6] [PG 2 Rank 6] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 5] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=78 -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 5] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank5]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 5] [PG 2 Rank 5] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 78 -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 13] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=98 -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 13] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank13]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 13] [PG 2 Rank 13] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -[default7]:[rank23]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 23] [PG 2 Rank 23] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 98 -E0703 10:41:04.838000 139815041251136 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1219460) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1219461) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219461 -[2]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1219462) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219462 -[3]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1219463) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219463 
-[4]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1219464) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219464 -[5]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1219465) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219465 -[6]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1219466) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219466 -[7]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1219467) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219467 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:41:04 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1219460) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1219460 -============================================================ -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -W0703 10:41:08.311000 139878855333632 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1140877_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:09.062000 140276852565760 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_850116_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:09.102000 140142188439296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3296700_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 10:41:09.627000 140147849172800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3296770 closing signal SIGTERM -W0703 10:41:09.627000 140147849172800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3296772 closing signal SIGTERM -W0703 10:41:09.628000 140147849172800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3296773 closing signal SIGTERM -W0703 10:41:09.628000 140147849172800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3296774 closing signal SIGTERM -W0703 10:41:09.692000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140946 closing signal SIGTERM -W0703 10:41:09.693000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140947 closing signal SIGTERM -W0703 10:41:09.693000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140948 closing signal SIGTERM -W0703 10:41:09.694000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140949 closing signal SIGTERM -W0703 10:41:09.695000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140950 closing signal SIGTERM -W0703 10:41:09.695000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140951 closing signal SIGTERM -W0703 10:41:09.695000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140952 closing signal SIGTERM -W0703 10:41:09.697000 139884516067136 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1140953 closing signal SIGTERM -W0703 10:41:09.705000 140282513299264 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 850185 closing signal SIGTERM -W0703 10:41:09.705000 140282513299264 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 850187 closing signal SIGTERM -W0703 10:41:09.705000 140282513299264 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 850189 closing signal SIGTERM -W0703 10:41:09.706000 140282513299264 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 850191 closing signal SIGTERM -E0703 10:41:10.374000 140147849172800 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3296769) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:41:10.388000 140147849172800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3296700_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:10.419000 140147849172800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3296700_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:10.444000 140147849172800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3296700_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-226.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 3296771) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3296771 -[2]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-226.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 3296775) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3296775 -[3]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-226.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 3296776) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3296776 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-226.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 3296769) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3296769 -============================================================ -E0703 10:41:10.458000 140282513299264 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 850186) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 10:41:10.472000 140282513299264 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_850116_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:10.512000 140282513299264 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_850116_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:10.529000 140282513299264 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_850116_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-220.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 850188) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 850188 -[2]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-220.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 850190) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 850190 -[3]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-220.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 850192) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 850192 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_10:41:09 - host : ip-26-0-163-220.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 850186) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 850186 -============================================================ -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 1: Exited with exit code 1 -W0703 10:41:13.316000 139878855333632 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1140877_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:18.320000 139878855333632 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1140877_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:22.933000 139884516067136 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1140877_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 10:41:22.949000 139884516067136 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1140877_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
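The run above ends in the timeout path: an ALLREDUCE (WorkNCCL SeqNum=147, NumelIn=34775040) hit the 600000 ms process-group timeout, and the ProcessGroupNCCL heartbeat monitor then aborted the remaining ranks with SIGABRT (exitcode -6). The error text itself names the two knobs it is referring to. As a minimal sketch only (not part of the original bench.slurm), these could be exported next to the script's other export lines before the srun call; the variable names are taken verbatim from the log, and the values are illustrative:

# Illustrative only: raise the heartbeat-monitor limit that triggered the
# "[F ProcessGroupNCCL.cpp:1169]" aborts above (the log reports the watchdog
# stuck for 600 seconds before the process was torn down).
export TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=1800   # example value: 30 minutes
# Or disable the heartbeat monitor entirely, as the message suggests:
# export TORCH_NCCL_ENABLE_MONITORING=0

The 600000 ms figure on the WorkNCCL line is the separate collective (process-group) timeout; raising that one would normally happen in the training code, e.g. via the timeout argument of torch.distributed.init_process_group, rather than through these environment variables. Neither change addresses the underlying hang; it only gives the watchdog more slack before it takes the job down, which is why runs like this are recorded as "timeout" in status.txt.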
diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-1/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/bench.slurm deleted file mode 100644 index 31b9028b7410034938ad1287b079b4e8f0f2524e..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/config.yaml deleted file mode 100644 index 528702afa9ba5a6c17b3b6d391d4772342708eb4..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 4 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 128 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out deleted file mode 100644 index 2b22e17b15b8f2dde23af097b506ce4a29d7a7cd..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/log.out +++ /dev/null @@ -1,4333 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:06:35 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:06:37.577000 140621017061184 torch/distributed/run.py:757] -W0703 09:06:37.577000 140621017061184 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.577000 140621017061184 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:06:37.577000 140621017061184 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.579000 140458706052928 torch/distributed/run.py:757] -W0703 09:06:37.579000 140458706052928 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.579000 140458706052928 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:06:37.579000 140458706052928 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.581000 140469501794112 torch/distributed/run.py:757] -W0703 09:06:37.581000 140469501794112 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.581000 140469501794112 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:06:37.581000 140469501794112 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.580000 140301802088256 torch/distributed/run.py:757] -W0703 09:06:37.580000 140301802088256 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.580000 140301802088256 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:06:37.580000 140301802088256 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.580000 140679352141632 torch/distributed/run.py:757] -W0703 09:06:37.580000 140679352141632 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.580000 140679352141632 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:06:37.580000 140679352141632 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.585000 140701615462208 torch/distributed/run.py:757] -W0703 09:06:37.585000 140701615462208 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.585000 140701615462208 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:06:37.585000 140701615462208 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.600000 139889410742080 torch/distributed/run.py:757] -W0703 09:06:37.600000 139889410742080 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.600000 139889410742080 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:06:37.600000 139889410742080 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.606000 140324045829952 torch/distributed/run.py:757] -W0703 09:06:37.606000 140324045829952 torch/distributed/run.py:757] ***************************************** -W0703 09:06:37.606000 140324045829952 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:06:37.606000 140324045829952 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:06:57 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:06:57 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=128, -[default0]:07/03/2024 09:06:57 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=4, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128')), -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 09:06:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default6]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=24|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=19|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=0|ip-26-0-169-132]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=7|ip-26-0-169-132]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=5|ip-26-0-169-132]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=16|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=6|ip-26-0-169-132]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=2|ip-26-0-169-132]: No checkpoint path provided. -[default1]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=25|ip-26-0-172-73]: No checkpoint path provided. 
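The learning-rate block in the config dump above (learning_rate=0.0001, lr_warmup_steps=1, linear warmup, linear decay over lr_decay_steps=19 down to min_decay_lr=1e-05) describes a simple piecewise-linear schedule over the 20 logged training steps. The sketch below is one common reading of those parameters, for orientation only; it is not nanotron's scheduler implementation.

# Piecewise-linear LR schedule implied by the logged LRSchedulerArgs (illustrative only).
def lr_at(step, lr=1e-4, min_lr=1e-5, warmup_steps=1, decay_steps=19):
    if step <= warmup_steps:                       # linear warmup to the peak LR
        return lr * step / warmup_steps
    progress = min(step - warmup_steps, decay_steps) / decay_steps
    return lr - (lr - min_lr) * progress           # linear decay down to min_decay_lr

print([round(lr_at(s), 6) for s in (1, 2, 10, 20)])  # peak at step 1, 1e-05 by step 20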
-[default5]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=1|ip-26-0-169-132]: No checkpoint path provided. -[default3]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=27|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=4|ip-26-0-169-132]: No checkpoint path provided. -[default3]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=3|ip-26-0-169-132]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=1|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=25|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=27|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=31|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=26|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=29|ip-26-0-168-238]: No checkpoint path provided. 
-[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=28|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=24|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-220]: No checkpoint path provided. -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-220]: No checkpoint path provided. 
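For reference, the parallelism and parameter figures in this log hang together arithmetically: dp x tp x pp = 2 x 32 x 1 = 64 ranks (the 64_GPUS layout), and sharding the 1.11G-parameter model over tp=32 leaves roughly 34.8M parameters, i.e. about 66 MiB of bf16 weights, per rank, matching the "Local number of parameters" lines above. A minimal sketch of that arithmetic, using only values reported in the log (the variable names are illustrative, not nanotron identifiers):

# Sanity-check the sharding figures reported above (values taken from this log).
dp, tp, pp = 2, 32, 1
world_size = dp * tp * pp                             # 64 ranks -> 8 nodes x 8 GPUs

total_params = 1.11e9                                 # "Total number of parameters: 1.11G"
bytes_per_param = 2                                   # bf16 weights (dtype=torch.bfloat16)

local_params = total_params / tp                      # ~34.7M; log reports 34.8M per TP rank
local_mib = local_params * bytes_per_param / 2**20    # ~66 MiB; log reports 66.33MiB
print(world_size, f"{local_params/1e6:.1f}M", f"{local_mib:.2f} MiB")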
-[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-220]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=30|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:07:15 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 09:07:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:07:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:07:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 09:07:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 09:07:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:07:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 09:07:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:07:18 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:07:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:07:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:07:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 09:07:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 09:07:19.524406 | mbs: 128 | grad_accum: 4 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:07:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:07:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default0]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=24|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=20|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. 
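The "[Start training]" line above is consistent with the rest of the config: global_batch_size = micro_batch_size x batch_accumulation_per_replica x dp = 128 x 4 x 2 = 1024 sequences, i.e. roughly 4.2M tokens per step at sequence_length 4096; likewise, with zero_stage=1 and dp=2, the "[ZeRO sharding]" lines show each DP rank holding 50% of the optimizer states. A small sketch of that bookkeeping (values from the log; not nanotron code):

# Reproduce the batch bookkeeping printed at "[Start training]".
mbs = 128          # micro_batch_size
grad_accum = 4     # batch_accumulation_per_replica
dp = 2             # data-parallel degree
seq_len = 4096     # sequence_length

global_batch_size = mbs * grad_accum * dp        # 1024 sequences per step, as logged
tokens_per_step = global_batch_size * seq_len    # 4,194,304 tokens per step
optimizer_shard_per_dp_rank = 1 / dp             # ZeRO stage 1: 50% of optimizer states each
print(global_batch_size, tokens_per_step, optimizer_shard_per_dp_rank)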
-[default7]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=31|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=28|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=22|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=21|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=29|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=27|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=30|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=24|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=17|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=7|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=5|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=16|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=19|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=26|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=14|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=6|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=25|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=2|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=3|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=18|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=25|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=21|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=22|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=30|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=10|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=11|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=29|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=1|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=4|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=16|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=15|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=28|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:07:20 [WARNING|DP=0|PP=0|TP=31|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:07:19 [WARNING|DP=0|PP=0|TP=26|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=0|ip-26-0-169-132]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=23|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=27|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:07:19 [WARNING|DP=1|PP=0|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:07:20 [WARNING|DP=1|PP=0|TP=18|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:20 [WARNING|DP=0|PP=0|TP=9|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:07:20 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:07:20 [WARNING|DP=1|PP=0|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:07:20 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:07:20 [WARNING|DP=1|PP=0|TP=19|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:07:24 [WARNING|DP=0|PP=0|TP=23|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:07:24 [WARNING|DP=0|PP=0|TP=17|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:07:24 [WARNING|DP=0|PP=0|TP=20|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: output = self.pp_block(**new_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank53]: output = self.pp_block(**new_kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank52]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank52]: output = self.o_proj(attention_output) -[default5]:[rank53]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank53]: output = self.o_proj(attention_output) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank53]: return row_linear( -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank53]: out = F.linear(input, weight, bias) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank52]: return row_linear( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank52]: out = F.linear(input, weight, bias) -[default5]:[rank53]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: Traceback (most recent call last): -[default6]:[rank54]: Traceback (most recent call last): -[default7]:[rank55]: trainer.train(dataloader) -[default2]:[rank50]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
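The CUDA OOM reported by ranks 52/53 above fails on an allocation of exactly 2.00 GiB inside row_linear, which matches the size of a single bf16 activation tensor of shape (micro_batch_size, sequence_length, hidden_size) at this configuration: 128 x 4096 x 2048 x 2 bytes. In other words, mbz=128 simply produces per-layer activations too large for the remaining memory on an ~80 GiB device; the expandable_segments hint quoted in the error only mitigates fragmentation. A minimal sketch of that size estimate, plus the allocator setting the error message itself suggests (illustrative only, not a fix that was applied in this run):

import os

# Size of the activation tensor the o_proj matmul tries to allocate (bf16).
mbs, seq_len, hidden, bytes_bf16 = 128, 4096, 2048, 2
alloc_gib = mbs * seq_len * hidden * bytes_bf16 / 2**30
print(f"requested allocation ~= {alloc_gib:.2f} GiB")   # 2.00 GiB, as in the error

# Allocator hint quoted in the error message; it must be set before the first
# CUDA allocation (e.g. in the job environment), and it only helps fragmentation.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")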
-[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default7]:[rank55]: output = model(**micro_batch) -[default2]:[rank50]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl - forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank59]: output = self.pp_block(**new_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, *[default6]:[rank54]: return self._call_impl(*args, **kwargs) -*kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default3]:[rank59]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank59]: output = self.o_proj(attention_output) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: [default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default7]:[rank55]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank59]: return row_linear( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank59]: out = F.linear(input, weight, bias) -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: Traceback (most recent call last): -[default7]:[rank55]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.38 GiB is free. Including non-PyTorch memory, this process has 77.94 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/py[default7]:[rank55]: return forward_call(*args, **kwargs) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -thon3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank56]: output = self.pp_block(**new_kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, 
**kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank48]: trainer.train(dataloader) -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default0]:[rank56]: output = self.o_proj(attention_output) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank56]: return row_linear( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: out = F.linear(input, weight, bias) -[default0]:[rank56]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. 
GPU -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: sharded_logits = self.model( -[default6]:[rank62]: 
return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank62]: output = self.pp_block(**new_kwargs) -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank62]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank62]: output = self.o_proj(attention_output) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: output = self.pp_block(**new_kwargs) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank62]: return row_linear( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank62]: out = F.linear(input, weight, bias) -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.04 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank63]: Traceback (most recent call last): -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank63]: trainer.train(dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank63]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank63]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank63]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank63]: output = model(**micro_batch) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank63]: output = self.pp_block(**new_kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank63]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank63]: output = self.o_proj(attention_output) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank63]: return 
row_linear( -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank63]: out = F.linear(input, weight, bias) -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default7]:[rank63]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.38 GiB is free. Including non-PyTorch memory, this process has 77.94 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: output = model(**micro_batch) -[default7]:[rank55]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: output = self.o_proj(attention_output) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default6]:[rank54]: output = self.pp_block(**new_kwargs) -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: Traceback (most recent call last): 
-[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: return row_linear( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank49]: output = self.pp_block(**new_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: out = F.linear(input, weight, bias) -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: output = self.pp_block(**new_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. 
Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank33]: output = model(**micro_batch) -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: Traceback (most recent call last): -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default0]:[rank32]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: output = self.o_proj(attention_output) -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank33]: sharded_logits = self.model( -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: output = self.pp_block(**new_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: output = self.o_proj(attention_output) -[default6]:[rank38]: Traceback (most recent call last): -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) 
-[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: return row_linear( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: trainer.train(dataloader) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank36]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: out = F.linear(input, weight, bias) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) 
-[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank50]: output = self.o_proj(attention_output) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank57]: output = model(**micro_batch) -[default3]:[rank51]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: output = self.pp_block(**new_kwargs) -[default0]:[rank48]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default5]:[rank37]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return row_linear( -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank50]: return row_linear( -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank49]: out = F.linear(input, weight, bias) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default2]:[rank50]: out = F.linear(input, weight, bias) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: hidden_encoder_states = 
encoder_block(**hidden_encoder_states) -[default2]:[rank50]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank49]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but un[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -allocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: output = model(**micro_batch) -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default0]:[rank48]: output = self.o_proj(attention_output) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward 
-[default1]:[rank57]: output = self.pp_block(**new_kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default3]:[rank51]: output = self.pp_block(**new_kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank57]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/benc[default3]:[rank51]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -h_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank40]: output = self.pp_block(**new_kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call[default0]:[rank48]: return row_linear( -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank40]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank40]: output = self.o_proj(attention_output) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/module[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -s/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank40]: return row_linear( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear 
-[default0]:[rank40]: out = F.linear(input, weight, bias) -[default0]:[rank40]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU -[default3]:[rank51]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: out = F.linear(input, weight, bias) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank48]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default2]:[rank58]: output = self.o_proj(attention_output) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank51]: output = self.o_proj(attention_output) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: output = self.o_proj(attention_output) -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank51]: return row_linear( -[default4]:[rank36]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank51]: out = F.linear(input, weight, bias) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default3]:[rank51]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank57]: return row_linear( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank57]: out = F.linear(input, weight, bias) -[default2]:[rank58]: return row_linear( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank58]: out = F.linear(input, weight, bias) -[default1]:[rank57]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.38 GiB is free. Including non-PyTorch memory, this process has 77.94 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank58]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.04 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: output = self.pp_block(**new_kwargs) -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank33]: output = self.pp_block(**new_kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank37]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) 
-[default0]:[rank32]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default0]:[rank32]: output = self.pp_block(**new_kwargs) -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: output = self.o_proj(attention_output) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return row_linear( -[default6]:[rank38]: output = self.pp_block(**new_kwargs) -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: out = F.linear(input, weight, bias) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank32]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: output = self.o_proj(attention_output) -[default4]:[rank36]: output = self.pp_block(**new_kwargs) -[default5]:[rank37]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 583.94 MiB is free. Including non-PyTorch memory, this process has 78.75 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: trainer.train(dataloader) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default6]:[rank38]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: return row_linear( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank32]: output = self.o_proj(attention_output) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: out = F.linear(input, weight, bias) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank38]: output = self.o_proj(attention_output) -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 583.94 MiB is free. Including non-PyTorch memory, this process has 78.75 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = self.o_proj(attention_output) -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank38]: return row_linear( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank32]: return row_linear( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: out = F.linear(input, weight, bias) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank39]: output = model(**micro_batch) -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default6]:[rank38]: out = F.linear(input, weight, bias) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank36]: return row_linear( -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank36]: out = F.linear(input, weight, bias) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default0]:[rank32]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU -[default4]:[rank36]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 771.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default6]:[rank38]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 771.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank39]: output = self.pp_block(**new_kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank39]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) 
-[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank39]: output = self.o_proj(attention_output) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank39]: return row_linear( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank39]: out = F.linear(input, weight, bias) -[default7]:[rank39]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 583.94 MiB is free. Including non-PyTorch memory, this process has 78.75 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank35]: Traceback (most recent call last): -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: trainer.train(dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank35]: output = self.pp_block(**new_kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank35]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank35]: output = self.o_proj(attention_output) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) 
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank35]: return row_linear( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank35]: out = F.linear(input, weight, bias) -[default3]:[rank35]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 583.94 MiB is free. Including non-PyTorch memory, this process has 78.75 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File 
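Every rank that crashes here fails on the same 2.00 GiB allocation in the o_proj row-parallel linear, with roughly 69 GiB of the card's 79.33 GiB already held by PyTorch. The allocator hint in the message is an environment variable; purely as an illustrative sketch (it was not set for this run, and with the GPUs this full it could at most reclaim the ~94 MiB reported as reserved-but-unallocated), it would be exported in the job environment before the training processes start:

# hypothetical addition, echoing the allocator hint in the OOM message above
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"

The remaining ranks abort at the same F.linear call; their out-of-memory reports follow, differing only in how much memory each GPU had free.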
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank60]: output = self.pp_block(**new_kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank60]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank60]: output = self.o_proj(attention_output) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank60]: return row_linear( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank60]: out = F.linear(input, weight, bias) -[default4]:[rank60]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.04 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward 
-[default2]:[rank34]: output = self.pp_block(**new_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank34]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank34]: output = self.o_proj(attention_output) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank34]: return row_linear( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank34]: out = F.linear(input, weight, bias) -[default2]:[rank34]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 771.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank61]: output = self.pp_block(**new_kwargs) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank61]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank61]: output = self.o_proj(attention_output) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank61]: return row_linear( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank61]: out = F.linear(input, weight, bias) -[default5]:[rank61]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.38 GiB is free. Including non-PyTorch memory, this process has 77.94 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: Traceback (most recent call last): -[default2]:[rank42]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default7]:[rank47]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: trainer.train(dataloader) -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, 
model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: output = model(**micro_batch) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default7]:[rank47]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) 
-[default2]:[rank42]: sharded_logits = self.model( -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: sharded_logits = self.model( -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: output = self.pp_block(**new_kwargs) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: output = self.pp_block(**new_kwargs) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: output = self.pp_block(**new_kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default1]:[rank41]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank42]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: output = self.pp_block(**new_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: output = self.o_proj(attention_output) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: output = self.o_proj(attention_output) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) 
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default2]:[rank42]: output = self.o_proj(attention_output) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: return row_linear( -[default7]:[rank47]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default4]:[rank44]: out = F.linear(input, weight, bias) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank44]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank42]: return row_linear( -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: out = F.linear(input, weight, bias) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default1]:[rank41]: return row_linear( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank42]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank47]: output = self.o_proj(attention_output) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank47]: return row_linear( -[default1]:[rank41]: out = F.linear(input, weight, bias) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank47]: out = F.linear(input, weight, bias) -[default1]:[rank41]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank47]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. 
Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank46]: Traceback (most recent call last): -[default5]:[rank45]: Traceback (most recent call last): -[default3]:[rank43]: Traceback (most recent call last): -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank45]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank45]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank45]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: output = model(**micro_batch) -[default5]:[rank45]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: output = model(**micro_batch) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: sharded_logits = self.model( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank45]: output = self.pp_block(**new_kwargs) -[default6]:[rank46]: output = self.pp_block(**new_kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states 
[... ranks 43, 45 and 46 finish the traceback begun above (llama.py:631 attn -> llama.py:598 o_proj -> nn.py:159 -> functional.py:474 row_linear -> F.linear), identical to the rank 26 traceback reconstructed below; their interleaved duplicate frames are condensed to the final error lines: ...]
-[default5]:[rank45]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default6]:[rank46]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank43]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank26]: Traceback (most recent call last):
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank26]: trainer.train(dataloader)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank26]: output = model(**micro_batch)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank26]: sharded_logits = self.model(
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default2]:[rank26]: output = self.pp_block(**new_kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default2]:[rank26]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default2]:[rank26]: output = self.o_proj(attention_output)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank26]: return self._call_impl(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank26]: return forward_call(*args, **kwargs)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default2]:[rank26]: return row_linear(
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default2]:[rank26]: out = F.linear(input, weight, bias)
-[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 583.94 MiB is free. Including non-PyTorch memory, this process has 78.75 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... rank 25 raises the identical traceback (interleaved with rank 26 in the original log); condensed to its final error line: ...]
-[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 771.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... ranks 24, 29, 12, 8 and 28 raise the identical traceback (run_train.py:237 -> trainer.py:429/462 -> engine.py:278/44 -> llama.py:891/764/780 -> block.py:151 -> llama.py:631/598 -> nn.py:159 -> functional.py:474 row_linear -> F.linear); rank 18 begins it but its output is cut off mid-line by interleaving; the duplicate frames are condensed to the final error lines: ...]
-[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU
-[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 771.94 MiB is free. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU
-[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 583.94 MiB is free. Including non-PyTorch memory, this process has 78.75 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... ranks 11, 15, 13, 10, 14 and 9 begin the identical traceback; their interleaved duplicate frames are condensed here and continue below ...]
-[default7]:[rank15]: output = self.pp_block(**new_kwargs)
-[default7]:[rank15]: File
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank10]: output = self.o_proj(attention_output) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank13]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default3]:[rank11]: output = self.o_proj(attention_output) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank15]: output = self.o_proj(attention_output) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank10]: return row_linear( -[default5]:[rank13]: return self._call_impl(*args, 
**kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank9]: sharded_logits = self.model( -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank13]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank13]: return row_linear( -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. 
Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank13]: out = F.linear(input, weight, bias) -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: output = self.o_proj(attention_output) -[default1]:[rank9]: output = self.pp_block(**new_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank14]: return row_linear( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: out = F.linear(input, weight, bias) -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank9]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank9]: output = self.o_proj(attention_output) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank9]: return self._call_impl(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank9]: return forward_call(*args, **kwargs) -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank9]: return row_linear( -[default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank9]: out = F.linear(input, weight, bias) -[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, 
in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: output = self.o_proj(attention_output) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 511.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default0]:[rank16]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step 
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank16]: trainer.train(dataloader) -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank22]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: output = model(**micro_batch) -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: output = model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank20]: return 
self._call_impl(*args, **kwargs) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank22]: sharded_logits = self.model( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: sharded_logits = self.model( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: output = model(**micro_batch) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: sharded_logits = self.model( -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: hidden_encoder_states = 
encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank22]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: sharded_logits = self.model( -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank22]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank20]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank21]: output = self.o_proj(attention_output) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: Traceback (most recent call last): -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default1]:[rank17]: Traceback (most recent call last): -[default6]:[rank22]: output = self.o_proj(attention_output) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank21]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: output = model(**micro_batch) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 403.94 MiB is free. Including non-PyTorch memory, this process has 78.92 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Ranks 16, 17, 19, 20, 21, 22, 23, 27 and 31 each raise the same torch.cuda.OutOfMemoryError; their interleaved tracebacks all follow this call path (every module call additionally passes through torch/nn/modules/module.py lines 1532/1541, _wrapped_call_impl/_call_impl):
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-    trainer.train(dataloader)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-    outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-    outputs = self.pipeline_engine.train_batch_iter(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-    output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-    output = model(**micro_batch)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-    sharded_logits = self.model(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-    return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-    hidden_encoder_states = encoder_block(**hidden_encoder_states)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-    output = self.pp_block(**new_kwargs)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-    output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-    output = self.o_proj(attention_output)
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-    return row_linear(
-  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-    out = F.linear(input, weight, bias)
-torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Per-rank free memory at the failure: ranks 17, 19 and 23 match rank 21 above (403.94 MiB free, 78.92 GiB in use including non-PyTorch memory); ranks 20 and 22 report 511.94 MiB free (78.82 GiB in use); ranks 27 and 31 report 771.94 MiB free (78.56 GiB in use); rank 16's message is truncated in the log after "Tried to allocate 2.00 GiB. GPU".
-Ranks 2, 4 and 30 fail at the same frame (out = F.linear(input, weight, bias) in row_linear), reached through the identical call path shown above. Rank 30 reports 583.94 MiB free (78.75 GiB in use); the figures for ranks 4 and 2 follow below.
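All of the failures above land in the same place: the attention output projection (o_proj) dispatches to row_linear, and the allocation that fails is the F.linear output. As a point of reference only, here is a generic sketch of a Megatron-style row-parallel linear, under the assumption that nanotron's row_linear follows the usual pattern (class and argument names below are invented, and an initialized torch.distributed process group is assumed): each rank multiplies by its input-dimension shard of the weight, so a full-size partial output is materialized on every tensor-parallel rank before the all-reduce.

import torch
import torch.nn.functional as F
import torch.distributed as dist

# Generic row-parallel linear, for illustration only
# (invented names; not nanotron's actual row_linear implementation).
class RowParallelLinearSketch(torch.nn.Module):
    def __init__(self, in_features: int, out_features: int, tp_group=None):
        super().__init__()
        self.tp_group = tp_group
        tp_size = dist.get_world_size(group=tp_group)
        assert in_features % tp_size == 0
        # Each rank owns a slice of the weight along the *input* dimension.
        self.weight = torch.nn.Parameter(
            torch.empty(out_features, in_features // tp_size)
        )

    def forward(self, x_shard: torch.Tensor) -> torch.Tensor:
        # x_shard holds only this rank's slice of the hidden dimension
        # (produced by the preceding column-parallel projection).
        partial = F.linear(x_shard, self.weight)        # full [*, out_features] tensor per rank
        dist.all_reduce(partial, group=self.tp_group)   # sum partial results across TP ranks
        return partial

In this pattern the input arrives already sharded, so the only communication is the final all-reduce; the price is that every rank allocates the full-size output, which is consistent with all 16 tensor-parallel ranks attempting the same 2.00 GiB allocation here.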
-Ranks 4 and 2 both report 1.38 GiB free with 77.94 GiB in use at the failed 2.00 GiB allocation (68.93 GiB allocated by PyTorch, 94.45 MiB reserved but unallocated, as above).
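Every message above ends with the same allocator hint. For reference, PYTORCH_CUDA_ALLOC_CONF has to be in the environment before the first CUDA allocation; below is a minimal sketch, assuming it is set at the top of the training entry point rather than in the launcher environment. Note that the hint targets fragmentation, and with only 94.45 MiB reserved-but-unallocated here the shortfall looks like genuine memory pressure rather than fragmentation, so this setting alone may not be enough.

import os

# Must be set before torch initializes its CUDA caching allocator,
# i.e. before the first CUDA tensor is created.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # imported after the env var on purpose


def report_free_memory() -> None:
    # Rough per-device view of what the OOM messages above report.
    if not torch.cuda.is_available():
        return
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"free: {free_bytes / 2**30:.2f} GiB / total: {total_bytes / 2**30:.2f} GiB")


if __name__ == "__main__":
    report_free_memory()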
-Ranks 1 and 5 hit the same traceback, both reporting 1.27 GiB free with 78.04 GiB in use at the failed 2.00 GiB allocation.
-[default7]:[rank7]: Traceback (most recent call last):
-[default7]:[rank7]: rank 7 follows the same call path, from trainer.train(dataloader) in run_train.py line 237 through sharded_logits = self.model( in models/llama.py line 891; its traceback continues below:
-[default7]:[rank7]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank7]: output = self.pp_block(**new_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank7]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank7]: output = self.o_proj(attention_output) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank7]: return row_linear( -[default7]:[rank7]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank7]: out = F.linear(input, weight, bias) -[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.04 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank0]: output = model(**micro_batch) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank0]: sharded_logits = self.model( -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank0]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank0]: output = self.pp_block(**new_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank0]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank0]: output = self.o_proj(attention_output) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank0]: return row_linear( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank0]: out = F.linear(input, weight, bias) -[default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. 
GPU -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: output = model(**micro_batch) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: sharded_logits = self.model( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank6]: output = self.pp_block(**new_kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return 
self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank6]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank6]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank6]: return row_linear( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: out = F.linear(input, weight, bias) -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.38 GiB is free. Including non-PyTorch memory, this process has 77.94 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: output = model(**micro_batch) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank3]: sharded_logits = self.model( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank3]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank3]: output = self.o_proj(attention_output) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank3]: return row_linear( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank3]: out = F.linear(input, weight, bias) -[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.04 GiB memory in use. Of the allocated memory 68.93 GiB is allocated by PyTorch, and 94.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -W0703 09:07:38.939000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967662 closing signal SIGTERM -W0703 09:07:38.939000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967664 closing signal SIGTERM -W0703 09:07:38.939000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967665 closing signal SIGTERM -W0703 09:07:38.940000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967666 closing signal SIGTERM -W0703 09:07:38.940000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967667 closing signal SIGTERM -W0703 09:07:38.940000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967668 closing signal SIGTERM -W0703 09:07:38.940000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 967669 closing signal SIGTERM -E0703 09:07:40.662000 140621017061184 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 967663) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:38 - host : ip-26-0-172-73.ec2.internal - rank : 57 (local_rank: 1) - exitcode : 1 (pid: 967663) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 09:07:43.951000 139889410742080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2468657 closing signal SIGTERM -W0703 09:07:43.951000 139889410742080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2468659 closing signal SIGTERM -W0703 09:07:43.951000 139889410742080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2468661 closing signal SIGTERM -W0703 09:07:43.951000 139889410742080 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2468662 closing signal SIGTERM -W0703 09:07:43.951000 139889410742080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2468663 closing signal SIGTERM -E0703 09:07:44.069000 140701615462208 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1203835) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:07:43 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1203836) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:07:43 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1203837) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:07:43 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1203838) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:07:43 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1203839) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:07:43 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1203840) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:07:43 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1203841) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:07:43 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1203842) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:43 - 
host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1203835) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -E0703 09:07:45.368000 139889410742080 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2468656) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:07:45.374000 139889410742080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2468587_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:45.407000 139889410742080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2468587_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:45.420000 139889410742080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-132.ec2.internal_2468587_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:07:43 - host : ip-26-0-169-132.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 2468658) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:07:43 - host : ip-26-0-169-132.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 2468660) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:43 - host : ip-26-0-169-132.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 2468656) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-132: task 5: Exited with exit code 1 -W0703 
09:07:47.967000 140318385096448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1924771_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:48.743000 140463841060608 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3282254_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:48.783000 140673691408128 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1897596_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:48.818000 140296141354752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-220.ec2.internal_835606_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:48.907000 140453045319424 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1125970_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:48.950000 140469501794112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3282323 closing signal SIGTERM -E0703 09:07:49.074000 140458706052928 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1126038) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:07:49.073000 140324045829952 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1924841) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:07:49.075000 140301802088256 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 835675) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:07:49.080000 140458706052928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1125970_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.081000 140324045829952 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1924771_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.080000 140301802088256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_835606_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 09:07:49.081000 140679352141632 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1897665) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:07:49.087000 140679352141632 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1897596_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 09:07:49.107000 140324045829952 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1924771_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.112000 140458706052928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1125970_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.113000 140679352141632 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1897596_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.115000 140301802088256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_835606_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.135000 140324045829952 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1924771_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:07:48 - host : ip-26-0-168-238.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1924842) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:07:48 - host : ip-26-0-168-238.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 1924843) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:07:48 - host : ip-26-0-168-238.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1924844) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:07:48 - host : ip-26-0-168-238.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 1924845) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:07:48 - host : 
ip-26-0-168-238.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1924846) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:07:48 - host : ip-26-0-168-238.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1924847) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:07:48 - host : ip-26-0-168-238.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1924848) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:48 - host : ip-26-0-168-238.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1924841) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0703 09:07:49.141000 140679352141632 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1897596_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main -W0703 09:07:49.145000 140458706052928 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1125970_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 1897666) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 42 (local_rank: 2) - exitcode : 1 (pid: 1897667) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 1897668) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 44 (local_rank: 4) - exitcode : 1 (pid: 1897669) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 45 (local_rank: 5) - exitcode : 1 (pid: 1897670) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 46 (local_rank: 6) - exitcode : 1 (pid: 1897671) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 47 (local_rank: 7) - exitcode : 1 (pid: 1897672) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:48 - host : ip-26-0-169-86.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 1897665) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - elastic_launch( - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 1126039) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 1126040) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 1126041) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 1126042) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 1126043) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 1126044) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 1126045) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:48 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 1126038) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0703 09:07:49.152000 140301802088256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-220.ec2.internal_835606_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 835676) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 835677) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 835678) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 835679) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 835680) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 835681) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 835682) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-220.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 835675) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:07:49.174000 140469501794112 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3282322) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:07:49.180000 140469501794112 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3282254_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.206000 140469501794112 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3282254_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:07:49.231000 140469501794112 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3282254_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-226.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 3282324) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-226.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 3282325) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-226.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 3282326) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-226.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 3282327) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-226.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 3282328) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-226.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 3282329) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:07:48 - host : ip-26-0-163-226.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 3282322) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-168-238: task 3: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-163-220: task 1: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 4: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-128/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/bench.slurm deleted file mode 100644 index 5e36d33744f2f4a2410f016e66df16141a095ccb..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/config.yaml deleted file mode 100644 index 13a3aadd1f4bb53b401270a834e955370e0d3fb9..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 32 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 16 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out deleted file mode 100644 index e0bcfdc7f6ca1c9b702bc8bd0cc5b9aef99dafb7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/log.out +++ /dev/null @@ -1,743 +0,0 @@ -======================== -START TIME: Wed Jul 3 05:23:42 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 05:23:47.302000 140552398518080 torch/distributed/run.py:757] -W0703 05:23:47.302000 140552398518080 torch/distributed/run.py:757] ***************************************** -W0703 05:23:47.302000 140552398518080 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:23:47.302000 140552398518080 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.301000 140444392572736 torch/distributed/run.py:757] -W0703 05:23:48.301000 140444392572736 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.301000 140444392572736 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:23:48.301000 140444392572736 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.476000 140026132907840 torch/distributed/run.py:757] -W0703 05:23:48.476000 140026132907840 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.476000 140026132907840 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:23:48.476000 140026132907840 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.490000 140158699005760 torch/distributed/run.py:757] -W0703 05:23:48.490000 140158699005760 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.490000 140158699005760 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:23:48.490000 140158699005760 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.626000 139739970131776 torch/distributed/run.py:757] -W0703 05:23:48.626000 139739970131776 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.626000 139739970131776 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 05:23:48.626000 139739970131776 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.627000 140232759547712 torch/distributed/run.py:757] -W0703 05:23:48.627000 140232759547712 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.627000 140232759547712 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:23:48.627000 140232759547712 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.708000 140422856705856 torch/distributed/run.py:757] -W0703 05:23:48.708000 140422856705856 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.708000 140422856705856 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:23:48.708000 140422856705856 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.957000 140108068357952 torch/distributed/run.py:757] -W0703 05:23:48.957000 140108068357952 torch/distributed/run.py:757] ***************************************** -W0703 05:23:48.957000 140108068357952 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 05:23:48.957000 140108068357952 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 05:24:14 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=32, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 05:24:14 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=16, -[default0]:07/03/2024 05:24:14 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=32, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16')), -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 05:24:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default6]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=22|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=16|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=17|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=21|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=20|ip-26-0-171-62]: No checkpoint path provided. 
-[default2]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=18|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=24|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=19|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=23|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=31|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=26|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=28|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=30|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=29|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=25|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 05:24:32 [INFO|DP=1|PP=0|TP=27|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=24|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=30|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=25|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=26|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: No checkpoint path provided. 
-[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. 
-[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=31|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=29|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=28|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-153]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-153]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=27|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 05:24:33 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 05:24:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 05:24:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 05:24:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 05:24:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 05:24:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 05:24:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 05:24:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 05:24:36 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:24:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 05:24:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 05:24:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 05:24:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 05:24:38.100722 | mbs: 16 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 05:24:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 05:24:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=4|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=16|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=25|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=22|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=19|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=21|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. 
Setting CardData to empty. -[default0]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=16|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=17|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=21|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=31|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=31|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=28|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=19|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=27|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=26|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=30|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=20|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=23|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=5|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 05:24:38 [WARNING|DP=1|PP=0|TP=6|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=24|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 05:24:38 [WARNING|DP=0|PP=0|TP=26|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty.
-[default2]:Repo card metadata block was not found. Setting CardData to empty.
-[... the same "Repo card metadata block was not found. Setting CardData to empty." warning is repeated verbatim by the remaining ranks between 05:24:38 and 05:24:43 ...]
-[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[... the same c10d::allreduce_ UserWarning, each followed by the "return Variable._execution_engine.run_backward(" traceback line, is repeated verbatim by every local rank; a few instances are interleaved mid-line by concurrent writes to the shared log ...]
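The c10d::allreduce_ warning above is raised once per rank because autograd is backpropagating through an in-place collective that has no autograd kernel registered; the message itself suggests either registering a proper autograd kernel or, for a non-differentiable op, registering torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. Below is a minimal, hypothetical Python-level sketch of one common way to avoid the warning altogether: route the collective through a torch.autograd.Function with an explicit backward, so autograd never reaches the not-implemented fallback. The name differentiable_all_reduce is illustrative only and is not taken from the code that produced this log.

import torch
import torch.distributed as dist


class _DifferentiableAllReduce(torch.autograd.Function):
    """Sketch: an all-reduce whose backward pass is defined explicitly."""

    @staticmethod
    def forward(ctx, tensor: torch.Tensor) -> torch.Tensor:
        # Work on a copy so the input tensor is not mutated in place.
        output = tensor.clone()
        dist.all_reduce(output, op=dist.ReduceOp.SUM)
        return output

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
        # For a SUM all-reduce, the gradient is itself all-reduced across ranks.
        grad = grad_output.clone()
        dist.all_reduce(grad, op=dist.ReduceOp.SUM)
        return grad


def differentiable_all_reduce(tensor: torch.Tensor) -> torch.Tensor:
    # Assumes the process group has already been initialized by the launcher.
    return _DifferentiableAllReduce.apply(tensor)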
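The torch.distributed.all_reduce_coalesced deprecation warnings shown below are emitted wherever that batched collective is called. As a sketch only (not taken from this repository), one straightforward alternative is to issue one asynchronous all-reduce per tensor and wait on the returned work handles; dist.all_reduce with async_op=True is standard torch.distributed API, while the helper name all_reduce_list is hypothetical.

import torch
import torch.distributed as dist


def all_reduce_list(tensors: list[torch.Tensor]) -> None:
    """Hypothetical helper replacing a single all_reduce_coalesced(tensors) call."""
    # Launch one asynchronous all-reduce per tensor, then wait for all of them.
    handles = [dist.all_reduce(t, op=dist.ReduceOp.SUM, async_op=True) for t in tensors]
    for handle in handles:
        handle.wait()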
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default0]:07/03/2024 05:25:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 366.47MiB. Peak allocated 16977.77MiB. Peak reserved: 17716.00MiB -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn(
-[...]: (the same torch.distributed.all_reduce_coalesced deprecation UserWarning, followed by the same warnings.warn( line, repeated by the remaining ranks)
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated.
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default0]:07/03/2024 05:25:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 49.6K | tokens_per_sec: 84.5K | tokens_per_sec_per_gpu: 1.32K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 0.0001 | model_tflops_per_gpu: 12 | hardware_tflops_per_gpu: 12 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 18.6G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 05:25:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 665.02MiB. Peak reserved: 17768.00MiB -[default0]:07/03/2024 05:25:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.22MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:26:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 9.53e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 18.7G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 05:26:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 665.04MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:26:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.22MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:26:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.05e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 76 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 18.7G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 05:26:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 665.04MiB. Peak reserved: 17836.00MiB -[default0]:STAGE:2024-07-03 05:26:44 36399:36399 ActivityProfilerController.cpp:314] Completed Stage: Warm Up -[default0]:07/03/2024 05:26:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.22MiB. Peak allocated 17110.51MiB. 
Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:27:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 11.5 | lr: 8.58e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 15.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 18.7G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 05:27:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 665.04MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:28:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 8.11e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 24.3 -[default0]:07/03/2024 05:28:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:28:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 10.1 | lr: 7.63e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 12.8 -[default0]:STAGE:2024-07-03 05:28:57 36399:36399 ActivityProfilerController.cpp:320] Completed Stage: Collection -[default0]:STAGE:2024-07-03 05:28:59 36399:36399 ActivityProfilerController.cpp:324] Completed Stage: Post Processing -[default0]:07/03/2024 05:31:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:31:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 9.36K | tokens_per_sec: 448K | tokens_per_sec_per_gpu: 7K | global_batch_size: 1.02K | lm_loss: 9.9 | lr: 7.16e-05 | model_tflops_per_gpu: 63.5 | hardware_tflops_per_gpu: 63.5 | grad_norm: 9.26 -[default0]:07/03/2024 05:31:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:32:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 9.56 | lr: 6.68e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 6.93 -[default0]:07/03/2024 05:32:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:32:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 9.2 | lr: 6.21e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 6.51 -[default0]:07/03/2024 05:32:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. 
Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:33:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 8.88 | lr: 5.74e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 5.55 -[default0]:07/03/2024 05:33:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:34:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 8.69 | lr: 5.26e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 5.87 -[default0]:07/03/2024 05:34:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:34:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 8.48 | lr: 4.79e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 5.83 -[default0]:07/03/2024 05:34:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:35:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 38.3K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 8.25 | lr: 4.32e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 5.08 -[default0]:07/03/2024 05:35:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:35:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 8.1 | lr: 3.84e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 5.09 -[default0]:07/03/2024 05:35:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:36:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 38.3K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 7.99 | lr: 3.37e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 5.12 -[default0]:07/03/2024 05:36:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 7.9 | lr: 2.89e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 5.13 -[default0]:07/03/2024 05:37:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. 
Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:37:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 7.78 | lr: 2.42e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 4.9 -[default0]:07/03/2024 05:37:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:38:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 38.3K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 7.67 | lr: 1.95e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 4.65 -[default0]:07/03/2024 05:38:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:39:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 7.59 | lr: 1.47e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 4.55 -[default0]:07/03/2024 05:39:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 499.20MiB. Peak allocated 17110.51MiB. Peak reserved: 17836.00MiB -[default0]:07/03/2024 05:39:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 38.4K | tokens_per_sec: 109K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 7.53 | lr: 1e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 4.51 -W0703 05:40:16.929000 140153038272256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1461869_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousTimeoutError. -W0703 05:40:16.929000 140417195972352 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-103.ec2.internal_910427_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousTimeoutError. -W0703 05:40:16.931000 140438731839232 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3936035_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousTimeoutError. -W0703 05:40:16.931000 140020472174336 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1186582_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousTimeoutError. -W0703 05:40:16.929000 140227098814208 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_707398_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousTimeoutError. -W0703 05:40:16.931000 140102407624448 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3806538_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousTimeoutError. 
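The per-iteration lines above ("iteration: N / 20 | consumed_tokens: ... | tokens_per_sec: ...") are what the post-run report step later flattens into a CSV (see "Saved 1 csv files over 1 completed logs" and "Results written to .../profiler.csv" further down). As a minimal sketch of that kind of extraction, assuming only the log format visible here and hypothetical file names (this is not the actual bench_cluster report code):

import csv
import re

# Hypothetical paths; the real run writes log.out and the CSVs under the
# results/llama-1B/64_GPUS/... directory shown in the surrounding log.
LOG_PATH = "log.out"
CSV_PATH = "iterations.csv"

# Matches the per-iteration lines, e.g.
# "... iteration: 20 / 20 | consumed_tokens: 83.9M | ... | tokens_per_sec_per_gpu: 1.71K | ..."
ITER_RE = re.compile(r"iteration: (\d+) / (\d+) \| (.+)")

def parse_fields(blob: str) -> dict:
    # Split "key: value | key: value" pairs into a dict of strings.
    fields = {}
    for chunk in blob.split(" | "):
        key, _, value = chunk.partition(": ")
        if key and value:
            fields[key.strip()] = value.strip()
    return fields

rows = []
with open(LOG_PATH) as f:
    for line in f:
        m = ITER_RE.search(line)
        if m:
            row = {"iteration": int(m.group(1)), "train_steps": int(m.group(2))}
            row.update(parse_fields(m.group(3)))
            rows.append(row)

if rows:
    # Union of all keys; DictWriter fills fields missing from a given row.
    keys = sorted({k for r in rows for k in r})
    with open(CSV_PATH, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(rows)

Run against a log like the one above, this yields one row per training step with columns such as tokens_per_sec_per_gpu, model_tflops_per_gpu and lm_loss.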
-W0703 05:40:17.020000 140444392572736 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3936035_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:40:17.022000 140552398518080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_925625_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:40:17.024000 140444392572736 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3936035_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 05:40:17.027000 140552398518080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_925625_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Saved 1 csv files over 1 completed logs -Processing file: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/profiler/ip-26-0-160-225_36399.1719984647392828901.pt.trace.json -Results written to /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-16/profiler.csv -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-225_36399.1719984647392828901.pt.trace.json: 0%| | 0.00/4.47G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/config.yaml deleted file mode 100644 index 2e33b63591d19089c14e72db01c35d793bd7f91a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 256 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 2 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/log.out deleted file mode 100644 index 5dd6502d6e7b14212aa747879f2c98afb5095c7f..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/log.out +++ /dev/null @@ -1,1621 +0,0 @@ -======================== -START TIME: Wed Jul 3 07:22:52 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 07:22:57.620000 140028362889024 torch/distributed/run.py:757] -W0703 07:22:57.620000 140028362889024 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.620000 140028362889024 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:22:57.620000 140028362889024 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.671000 139987244689216 torch/distributed/run.py:757] -W0703 07:22:57.671000 139987244689216 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.671000 139987244689216 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:22:57.671000 139987244689216 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.704000 140206149109568 torch/distributed/run.py:757] -W0703 07:22:57.704000 140206149109568 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.704000 140206149109568 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:22:57.704000 140206149109568 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.925000 140512356345664 torch/distributed/run.py:757] -W0703 07:22:57.925000 140512356345664 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.925000 140512356345664 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:22:57.925000 140512356345664 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.958000 139820889937728 torch/distributed/run.py:757] -W0703 07:22:57.958000 139820889937728 torch/distributed/run.py:757] ***************************************** -W0703 07:22:57.958000 139820889937728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 07:22:57.958000 139820889937728 torch/distributed/run.py:757] ***************************************** -W0703 07:22:58.239000 140613056567104 torch/distributed/run.py:757] -W0703 07:22:58.239000 140613056567104 torch/distributed/run.py:757] ***************************************** -W0703 07:22:58.239000 140613056567104 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:22:58.239000 140613056567104 torch/distributed/run.py:757] ***************************************** -W0703 07:22:58.304000 139987419359040 torch/distributed/run.py:757] -W0703 07:22:58.304000 139987419359040 torch/distributed/run.py:757] ***************************************** -W0703 07:22:58.304000 139987419359040 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:22:58.304000 139987419359040 torch/distributed/run.py:757] ***************************************** -W0703 07:22:58.550000 139898246186816 torch/distributed/run.py:757] -W0703 07:22:58.550000 139898246186816 torch/distributed/run.py:757] ***************************************** -W0703 07:22:58.550000 139898246186816 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:22:58.550000 139898246186816 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 07:23:22 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=32, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 07:23:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=2, -[default0]:07/03/2024 07:23:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=256, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2')), -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272) -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 07:23:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default6]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=30|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=13|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=10|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=2|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=9|ip-26-0-173-246]: No checkpoint path provided. -[default1]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=1|ip-26-0-173-202]: No checkpoint path provided. -[default6]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=6|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=0|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=8|ip-26-0-173-246]: No checkpoint path provided. -[default3]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=3|ip-26-0-173-202]: No checkpoint path provided. -[default4]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=4|ip-26-0-173-202]: No checkpoint path provided. -[default5]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=5|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=25|ip-26-0-174-36]: No checkpoint path provided. -[default5]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=21|ip-26-0-173-7]: No checkpoint path provided. -[default2]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=18|ip-26-0-173-7]: No checkpoint path provided. -[default3]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=19|ip-26-0-173-7]: No checkpoint path provided. -[default6]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=22|ip-26-0-173-7]: No checkpoint path provided. 
-[default0]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=16|ip-26-0-173-7]: No checkpoint path provided. -[default3]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=11|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=20|ip-26-0-173-7]: No checkpoint path provided. -[default7]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=15|ip-26-0-173-246]: No checkpoint path provided. -[default4]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=12|ip-26-0-173-246]: No checkpoint path provided. -[default1]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=17|ip-26-0-173-7]: No checkpoint path provided. -[default5]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=29|ip-26-0-174-36]: No checkpoint path provided. -[default6]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=14|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=26|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=24|ip-26-0-174-36]: No checkpoint path provided. -[default7]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=31|ip-26-0-174-36]: No checkpoint path provided. -[default4]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=28|ip-26-0-174-36]: No checkpoint path provided. -[default3]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=27|ip-26-0-174-36]: No checkpoint path provided. -[default7]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=23|ip-26-0-173-7]: No checkpoint path provided. -[default7]:07/03/2024 07:23:41 [INFO|DP=1|PP=0|TP=7|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. 
-[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Parametrizing model parameters using StandardParametrizator -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=27|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=27|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=27|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=29|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=25|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=25|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=24|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=24|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=24|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=29|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=29|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=31|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=31|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=28|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=25|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=28|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=28|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=30|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=31|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=30|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=30|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=26|ip-26-0-165-24]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=26|ip-26-0-165-24]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=26|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=19|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=19|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=19|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=18|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=18|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=21|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=21|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=21|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=18|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=22|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=22|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=22|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=16|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=16|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=16|ip-26-0-164-207]: No checkpoint path provided. -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=17|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=17|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=17|ip-26-0-164-207]: No checkpoint path provided. -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=23|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=23|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=23|ip-26-0-164-207]: No checkpoint path provided. -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=20|ip-26-0-164-207]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=20|ip-26-0-164-207]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:23:42 [INFO|DP=0|PP=0|TP=20|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 07:23:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 07:23:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 07:23:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 07:23:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 07:23:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 07:23:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Using `datasets` library -[default0]:07/03/2024 07:23:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 07:23:45 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:23:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 07:23:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 07:23:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: -[default0]:07/03/2024 07:23:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Start training] datetime: 2024-07-03 07:23:47.282814 | mbs: 2 | grad_accum: 256 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 07:23:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 07:23:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default0]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=8|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=11|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=9|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=2|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=6|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=4|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=3|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=8|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=9|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=14|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=5|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=15|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=13|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=27|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=29|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=25|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=30|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=28|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=24|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=26|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=19|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=22|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=18|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=11|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=14|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=29|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=16|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=20|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=28|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=26|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=31|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=27|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=19|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=18|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=7|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=22|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=16|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=17|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=20|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=30|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=10|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=13|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=12|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=0|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=10|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=1|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=21|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=25|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=31|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=17|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=12|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=15|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=24|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:23:47 [WARNING|DP=1|PP=0|TP=23|ip-26-0-173-7]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=21|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 07:23:47 [WARNING|DP=0|PP=0|TP=23|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. 
DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass 
-[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd).
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. 
If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
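The c10d::allreduce_ warnings above are emitted once per rank during the backward pass. Below is a minimal, hedged sketch (not taken from nanotron's code) of how such a warning can arise and how a collective can be kept out of the autograd graph; the single-process gloo group and tensor names are purely illustrative so the snippet runs without GPUs or multiple ranks.

# Hedged sketch: reproduce and avoid the "c10d::allreduce_: an autograd kernel
# was not registered" warning. Assumes nothing beyond public torch.distributed APIs.
import torch
import torch.distributed as dist

# Single-process group only so the example is self-contained.
dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29512",
                        rank=0, world_size=1)

x = torch.ones(4, requires_grad=True)

# Warning path: the in-place all_reduce is recorded in the autograd graph,
# so backward() has to backprop "through" c10d::allreduce_, which has no
# registered autograd kernel.
y = x * 2
dist.all_reduce(y)
y.sum().backward()  # triggers the UserWarning seen in the log

# Quieter path: run the collective outside autograd tracking, e.g. on an
# already-computed gradient, so nothing needs to backprop through it.
with torch.no_grad():
    dist.all_reduce(x.grad)

dist.destroy_process_group()

If the collective genuinely should not be differentiated through, running it under torch.no_grad() (or on detached tensors) keeps it out of the graph and the warning from firing.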
-[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions
-[default5]: warnings.warn(
-[...]: (the identical all_reduce_coalesced deprecation warning is emitted by every rank, default0-default7, on each node; the repeated copies are omitted here)
-[default0]:07/03/2024 07:24:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 365.60MiB. Peak allocated 2588.00MiB. Peak reserved: 2728.00MiB
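The deprecation warning above comes from a call to torch.distributed.all_reduce_coalesced. As a hedged sketch (not the project's actual fix), the same reduction can be done with plain per-tensor all_reduce calls issued asynchronously; the helper name all_reduce_tensors is hypothetical.

# Hedged sketch of a drop-in alternative to the deprecated all_reduce_coalesced.
import torch
import torch.distributed as dist

def all_reduce_tensors(tensors, group=None):
    """Reduce each tensor in-place across the group. This trades the single
    fused collective of all_reduce_coalesced for several smaller ones, so it
    may be slower when there are many tiny tensors."""
    handles = [dist.all_reduce(t, op=dist.ReduceOp.SUM, group=group, async_op=True)
               for t in tensors]
    for h in handles:
        h.wait()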
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions
-[default3]: warnings.warn(
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions
-[default2]: warnings.warn(
-[default0]:07/03/2024 07:24:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 53.8K | tokens_per_sec: 78K | tokens_per_sec_per_gpu: 1.22K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 0.0001 | model_tflops_per_gpu: 11.1 | hardware_tflops_per_gpu: 11.1 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 2.88G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 07:24:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.33MiB. Peak allocated 664.15MiB. Peak reserved: 2746.00MiB
-[default0]:07/03/2024 07:25:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.45MiB. Peak allocated 2597.89MiB. Peak reserved: 2848.00MiB
-[default0]:07/03/2024 07:25:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 38.1K | tokens_per_sec: 110K | tokens_per_sec_per_gpu: 1.72K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 9.53e-05 | model_tflops_per_gpu: 15.6 | hardware_tflops_per_gpu: 15.6 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 2.99G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 07:25:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.33MiB. Peak allocated 664.27MiB. Peak reserved: 2848.00MiB
-[default0]:07/03/2024 07:26:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.45MiB. Peak allocated 2597.89MiB. Peak reserved: 2848.00MiB
-[default0]:07/03/2024 07:26:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 42.4K | tokens_per_sec: 99K | tokens_per_sec_per_gpu: 1.55K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.05e-05 | model_tflops_per_gpu: 14 | hardware_tflops_per_gpu: 14 | grad_norm: 76 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 2.99G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 07:26:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.33MiB. Peak allocated 664.27MiB. Peak reserved: 2848.00MiB
-[default0]:STAGE:2024-07-03 07:26:01 1706065:1706065 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
-[default0]:07/03/2024 07:26:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.45MiB. Peak allocated 2597.89MiB. Peak reserved: 2848.00MiB
-[default0]:07/03/2024 07:26:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 55.4K | tokens_per_sec: 75.7K | tokens_per_sec_per_gpu: 1.18K | global_batch_size: 1.02K | lm_loss: 11.5 | lr: 8.58e-05 | model_tflops_per_gpu: 10.7 | hardware_tflops_per_gpu: 10.7 | grad_norm: 15.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 2.99G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 07:26:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.33MiB. Peak allocated 664.27MiB. Peak reserved: 2848.00MiB
-[default0]:07/03/2024 07:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 56.3K | tokens_per_sec: 74.5K | tokens_per_sec_per_gpu: 1.16K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 8.11e-05 | model_tflops_per_gpu: 10.6 | hardware_tflops_per_gpu: 10.6 | grad_norm: 24.3
-[default0]:07/03/2024 07:27:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 498.33MiB. Peak allocated 2597.89MiB. Peak reserved: 2848.00MiB
-[default0]:07/03/2024 07:28:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 56.1K | tokens_per_sec: 74.7K | tokens_per_sec_per_gpu: 1.17K | global_batch_size: 1.02K | lm_loss: 10.1 | lr: 7.63e-05 | model_tflops_per_gpu: 10.6 | hardware_tflops_per_gpu: 10.6 | grad_norm: 12.8
-[default0]:STAGE:2024-07-03 07:31:24 1706065:1706065 ActivityProfilerController.cpp:320] Completed Stage: Collection
-[default0]:STAGE:2024-07-03 07:31:40 1706065:1706065 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 23] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600035 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600025 milliseconds before timing out.
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600091 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600046 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600087 milliseconds before timing out.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600021 milliseconds before timing out.
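Editor's note: the six iteration lines above are the only datapoints this run produced before the NCCL timeouts that begin above and continue below, so it is worth checking that the derived throughput figures are internally consistent. A minimal sketch; the 64-rank count is read off the rank numbers that appear in the watchdog messages (ranks 0 through 63), and the sequence-length figure is an inference, not something the log states directly:

```python
# Sanity check of the throughput figures reported for iteration 1 above.
# Inputs are copied from the log line; n_gpus = 64 is taken from the rank
# numbers in the watchdog messages, and seq_len is an assumption derived as
# consumed_tokens / global_batch_size.
consumed_tokens = 4.19e6      # consumed_tokens
elapsed_ms = 53.8e3           # elapsed_time_per_iteration_ms
n_gpus = 64
global_batch_size = 1024      # reported as 1.02K

tokens_per_sec = consumed_tokens / (elapsed_ms / 1e3)   # ~77.9K, log reports 78K
tokens_per_sec_per_gpu = tokens_per_sec / n_gpus        # ~1.22K, matches the log
seq_len = consumed_tokens / global_batch_size           # ~4096 (inferred)

print(f"{tokens_per_sec:,.0f} tok/s, {tokens_per_sec_per_gpu:,.0f} tok/s/GPU, seq_len ~ {seq_len:,.0f}")
```

After the profiler warm-up, iterations 4-6 settle at roughly 1.16-1.18K tokens/s per GPU (about 10.6-10.7 TFLOPs per GPU), which is the steady-state figure this configuration sustained before the hang reported below.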
-[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 27] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600066 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600096 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600089 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600668 milliseconds before timing out. 
-[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. 
-[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600013 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600014 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600011 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600018 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. 
-[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 30] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 30] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600009 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=34775040, NumelOut=34775040, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356647, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 601814 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356671, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 601989 milliseconds before timing out. 
-[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356627, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602313 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356624, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 602504 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356645, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602492 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356582, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 602504 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356647, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602517 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356644, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 602550 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356642, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 602754 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356667, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602837 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356628, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 602878 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356667, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602803 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356671, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602903 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356560, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602992 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356572, OpType=ALLREDUCE, NumelIn=8192, NumelOut=8192, Timeout(ms)=600000) ran for 603014 milliseconds before timing out. 
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356671, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 603028 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 602988 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356663, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 602997 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356667, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 603041 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356645, OpType=_ALLGATHER_BASE, NumelIn=524288, NumelOut=16777216, Timeout(ms)=600000) ran for 603029 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356551, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 603087 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356636, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 603050 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356672, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 603200 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=356559, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 603276 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1316] [PG 0 Rank 8] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=0 -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1153] [PG 0 Rank 8] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank8]:[F ProcessGroupNCCL.cpp:1169] [PG 0 Rank 8] [PG 0 Rank 8] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. 
workMetaList_.size() = 0 -W0703 07:43:20.729000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 838214 closing signal SIGTERM -W0703 07:43:20.731000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 838215 closing signal SIGTERM -W0703 07:43:20.731000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 838216 closing signal SIGTERM -W0703 07:43:20.731000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 838217 closing signal SIGTERM -W0703 07:43:20.734000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 838218 closing signal SIGTERM -W0703 07:43:20.734000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 838219 closing signal SIGTERM -W0703 07:43:20.734000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 838220 closing signal SIGTERM -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 19] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 19] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank19]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 19] [PG 2 Rank 19] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 18] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 18] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank18]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 18] [PG 2 Rank 18] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 6] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 6] ProcessGroupNCCL preparing to dump debug info. 
-[default6]:[rank6]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 6] [PG 2 Rank 6] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 2] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 29] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 2] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank2]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 2] [PG 2 Rank 2] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 29] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank29]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 29] [PG 2 Rank 29] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 7] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 7] ProcessGroupNCCL preparing to dump debug info. -[default7]:[rank7]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 7] [PG 2 Rank 7] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 4] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 4] ProcessGroupNCCL preparing to dump debug info. -[default4]:[rank4]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 4] [PG 2 Rank 4] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 30] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=119 -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 30] ProcessGroupNCCL preparing to dump debug info. -[default6]:[rank30]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 30] [PG 2 Rank 30] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 119 -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 31] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 31] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 31] To avoid data inconsistency, we are taking the entire process down. 
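Editor's note: the watchdog and heartbeat-monitor messages above name the relevant knobs themselves: the 600000 ms (10-minute) collective timeout and the TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC / TORCH_NCCL_ENABLE_MONITORING environment variables. For reference, a minimal sketch of where those would be set in a plain torch.distributed entry point run under torchrun; the values are illustrative assumptions, and raising them only delays the abort of a genuinely stuck reduce-scatter like this one rather than fixing it:

```python
# Illustrative only: the timeout knobs quoted in the watchdog messages above.
# Values are placeholders, not what this benchmark run used.
import os
from datetime import timedelta

import torch.distributed as dist

# Heartbeat-monitor knobs named verbatim in the log message; they must be set
# before the process group (and its watchdog thread) is created.
os.environ["TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC"] = "1800"
# os.environ["TORCH_NCCL_ENABLE_MONITORING"] = "0"   # or disable the monitor entirely

# Collective timeout: the log's Timeout(ms)=600000 is the default 10 minutes.
# torchrun supplies RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT for env:// init.
dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))
```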
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 31] Process group watchdog thread terminated with exception: [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1ca0772897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 25] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 25] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1ca1a4bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 25] To avoid data inconsistency, we are taking the entire process down. -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1ca1a50a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1ca1a51dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 25] Process group watchdog thread terminated with exception: [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ae5a03897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4ae6cdcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4ae6ce1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4ae6ce2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f4b3277be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #4: + 0xd3e95 (0x7f1ced4eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f1cf2531609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f1cf22fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #5: + 0x8609 (0x7f4b377c2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]: -[default1]:frame #6: clone + 0x43 (0x7f4b3758d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 31] Process group watchdog thread terminated with exception: [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default1]: what(): [PG 2 Rank 25] Process group watchdog thread terminated with exception: [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600092 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ae5a03897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4ae6cdcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1ca0772897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1ca1a4bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4ae6ce1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1ca1a50a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1ca1a51dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f1ced4eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4ae6ce2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #5: + 0x8609 (0x7f1cf2531609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: + 0xd3e95 (0x7f4b3277be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #6: clone + 0x43 (0x7f1cf22fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #5: + 0x8609 (0x7f4b377c2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1ca0772897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #6: clone + 0x43 (0x7f4b3758d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:frame #1: + 0xe32119 (0x7f1ca16d5119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f1ced4eae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f1cf2531609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]: -[default7]:frame #4: clone + 0x43 (0x7f1cf22fc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]: -[default1]:frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4ae5a03897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f4ae6966119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f4b3277be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f4b377c2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f4b3758d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 26] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 26] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 26] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 26] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34a5741897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f34a6a1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f34a6a1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f34a6a20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f34f24b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f34f7500609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f34f72cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 26] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600061 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34a5741897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f34a6a1ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f34a6a1fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f34a6a20dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f34f24b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f34f7500609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f34f72cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f34a5741897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f34a66a4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f34f24b9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f34f7500609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f34f72cb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 28] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 28] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 28] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 28] Process group watchdog thread terminated with exception: [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fecec3e0897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7feced6b9c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7feced6bea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7feced6bfdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #4: + 0xd3e95 (0x7fed39158e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #5: + 0x8609 (0x7fed3e19f609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fed3df6a353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default4]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default4]: what(): [PG 2 Rank 28] Process group watchdog thread terminated with exception: [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600030 milliseconds before timing out.
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 24] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 24] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 24] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 24] Process group watchdog thread terminated with exception: [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 2 Rank 24] Process group watchdog thread terminated with exception: [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 21] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 21] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 21] To avoid data inconsistency, we are taking the entire process down.
-[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 21] Process group watchdog thread terminated with exception: [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600080 milliseconds before timing out.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 22] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 22] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 22] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 22] Process group watchdog thread terminated with exception: [Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600066 milliseconds before timing out.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 17] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 17] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 17] To avoid data inconsistency, we are taking the entire process down.
-[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 17] Process group watchdog thread terminated with exception: [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 20] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 20] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 20] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 20] Process group watchdog thread terminated with exception: [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600025 milliseconds before timing out.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 16] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 16] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 16] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 16] Process group watchdog thread terminated with exception: [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600668 milliseconds before timing out.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 27] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 27] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 27] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 27] Process group watchdog thread terminated with exception: [Rank 27] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600045 milliseconds before timing out.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 23] Timeout at NCCL work: 305728, last enqueued NCCL work: 305846, last completed NCCL work: 305727.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 23] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 23] To avoid data inconsistency, we are taking the entire process down.
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 23] Process group watchdog thread terminated with exception: [Rank 23] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=524288, Timeout(ms)=600000) ran for 600035 milliseconds before timing out.
-W0703 07:43:30.760000 140028362889024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1706065 closing signal SIGTERM
-W0703 07:43:30.762000 140028362889024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1706066 closing signal SIGTERM
-W0703 07:43:30.762000 140028362889024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1706068 closing signal SIGTERM
-W0703 07:43:30.764000 140028362889024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1706070 closing signal SIGTERM
-E0703 07:43:31.913000 140512356345664 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 838213) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time : 2024-07-03_07:43:20, host : ip-26-0-163-147.ec2.internal, rank : 8 (local_rank: 0), exitcode : -6 (pid: 838213), traceback : Signal 6 (SIGABRT) received by PID 838213
-============================================================
-srun: error: ip-26-0-163-147: task 1: Exited with exit code 1
-W0703 07:43:35.771000 140613056567104 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 450933 closing signal SIGTERM
-E0703 07:43:35.818000 139898246186816 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 940358) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 25 (local_rank: 1), exitcode : -6 (pid: 940359), traceback : Signal 6 (SIGABRT) received by PID 940359
-[2]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 26 (local_rank: 2), exitcode : -6 (pid: 940360), traceback : Signal 6 (SIGABRT) received by PID 940360
-[3]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 27 (local_rank: 3), exitcode : -6 (pid: 940361), traceback : Signal 6 (SIGABRT) received by PID 940361
-[4]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 28 (local_rank: 4), exitcode : -6 (pid: 940362), traceback : Signal 6 (SIGABRT) received by PID 940362
-[5]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 29 (local_rank: 5), exitcode : -6 (pid: 940363), traceback : Signal 6 (SIGABRT) received by PID 940363
-[6]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 30 (local_rank: 6), exitcode : -6 (pid: 940364), traceback : Signal 6 (SIGABRT) received by PID 940364
-[7]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 31 (local_rank: 7), exitcode : -6 (pid: 940365), traceback : Signal 6 (SIGABRT) received by PID 940365
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time : 2024-07-03_07:43:35, host : ip-26-0-165-24.ec2.internal, rank : 24 (local_rank: 0), exitcode : -6 (pid: 940358), traceback : Signal 6 (SIGABRT) received by PID 940358
-============================================================
-E0703 07:43:36.085000 140613056567104 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 450932) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time : 2024-07-03_07:43:35, host : ip-26-0-164-207.ec2.internal, rank : 18 (local_rank: 2), exitcode : -6 (pid: 450934), traceback : Signal 6 (SIGABRT) received by PID 450934
-[2]: time : 2024-07-03_07:43:35, host : ip-26-0-164-207.ec2.internal, rank : 19 (local_rank: 3), exitcode : -6 (pid: 450935), traceback : Signal 6 (SIGABRT) received by PID 450935
-[3]: time : 2024-07-03_07:43:35, host : ip-26-0-164-207.ec2.internal, rank : 20 (local_rank: 4), exitcode : -6 (pid: 450936), traceback : Signal 6 (SIGABRT) received by PID 450936
-[4]: time : 2024-07-03_07:43:35, host : ip-26-0-164-207.ec2.internal, rank : 21 (local_rank: 5), exitcode : -6 (pid: 450937), traceback : Signal 6 (SIGABRT) received by PID 450937
-[5]: time : 2024-07-03_07:43:35, host : ip-26-0-164-207.ec2.internal, rank : 22 (local_rank: 6), exitcode : -6 (pid: 450938), traceback : Signal 6 (SIGABRT) received by PID 450938
-[6]: time : 2024-07-03_07:43:35, host : ip-26-0-164-207.ec2.internal, rank : 23 (local_rank: 7), exitcode : -6 (pid: 450939), traceback : Signal 6 (SIGABRT) received by PID 450939
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time : 2024-07-03_07:43:35, host : ip-26-0-164-207.ec2.internal, rank : 16 (local_rank: 0), exitcode : -6 (pid: 450932), traceback : Signal 6 (SIGABRT) received by PID 450932
-============================================================
-srun: error: ip-26-0-165-24: task 3: Exited with exit code 1
-srun: error: ip-26-0-164-207: task 2: Exited with exit code 1
-E0703 07:43:40.124000 140028362889024 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 2 (pid: 1706067) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]: time : 2024-07-03_07:43:30, host : ip-26-0-162-233.ec2.internal, rank : 4 (local_rank: 4), exitcode : -6 (pid: 1706069), traceback : Signal 6 (SIGABRT) received by PID 1706069
-[2]: time : 2024-07-03_07:43:30, host : ip-26-0-162-233.ec2.internal, rank : 6 (local_rank: 6), exitcode : -6 (pid: 1706071), traceback : Signal 6 (SIGABRT) received by PID 1706071
-[3]: time : 2024-07-03_07:43:30, host : ip-26-0-162-233.ec2.internal, rank : 7 (local_rank: 7), exitcode : -6 (pid: 1706072), traceback : Signal 6 (SIGABRT) received by PID 1706072
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]: time : 2024-07-03_07:43:30, host : ip-26-0-162-233.ec2.internal, rank : 2 (local_rank: 2), exitcode : -6 (pid: 1706067), traceback : Signal 6 (SIGABRT) received by PID 1706067
-============================================================
-W0703 07:43:40.305000 139981758625536 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-202.ec2.internal_1348543_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 07:43:40.662000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881638 closing signal SIGTERM
-W0703 07:43:40.662000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881639 closing signal SIGTERM
-W0703 07:43:40.662000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881640 closing signal SIGTERM
-W0703 07:43:40.665000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881641 closing signal SIGTERM
-W0703 07:43:40.665000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881642 closing signal SIGTERM
-W0703 07:43:40.665000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881643 closing signal SIGTERM
-W0703 07:43:40.665000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881644 closing signal SIGTERM
-W0703 07:43:40.665000 139987244689216 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 881645 closing signal SIGTERM
-W0703 07:43:40.779000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072415 closing signal SIGTERM
-W0703 07:43:40.780000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072416 closing signal SIGTERM
-W0703 07:43:40.780000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072417 closing signal SIGTERM
-W0703 07:43:40.781000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072418 closing signal SIGTERM
-W0703 07:43:40.781000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072419 closing signal SIGTERM
-W0703 07:43:40.782000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072420 closing signal SIGTERM
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072420 closing signal SIGTERM -W0703 07:43:40.782000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072421 closing signal SIGTERM -W0703 07:43:40.782000 140206149109568 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2072422 closing signal SIGTERM -W0703 07:43:40.794000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348616 closing signal SIGTERM -W0703 07:43:40.795000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348617 closing signal SIGTERM -W0703 07:43:40.795000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348618 closing signal SIGTERM -W0703 07:43:40.795000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348619 closing signal SIGTERM -W0703 07:43:40.797000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348620 closing signal SIGTERM -W0703 07:43:40.797000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348621 closing signal SIGTERM -W0703 07:43:40.798000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348622 closing signal SIGTERM -W0703 07:43:40.798000 139987419359040 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1348623 closing signal SIGTERM -srun: error: ip-26-0-162-233: task 0: Exited with exit code 1 -W0703 07:43:44.611000 139981583955712 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-174-36.ec2.internal_881564_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:44.644000 140200488376064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-7.ec2.internal_2072342_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:44.888000 139815229204224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_366912_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:45.309000 139981758625536 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-202.ec2.internal_1348543_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 07:43:45.777000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366985 closing signal SIGTERM -W0703 07:43:45.778000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366986 closing signal SIGTERM -W0703 07:43:45.778000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366987 closing signal SIGTERM -W0703 07:43:45.779000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366988 closing signal SIGTERM -W0703 07:43:45.779000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366989 closing signal SIGTERM -W0703 07:43:45.780000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366990 closing signal SIGTERM -W0703 07:43:45.780000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366991 closing signal SIGTERM -W0703 07:43:45.780000 139820889937728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 366992 closing signal SIGTERM -W0703 07:43:48.405000 139987244689216 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_881564_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:48.423000 139987244689216 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-174-36.ec2.internal_881564_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Connection reset by peer - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", 
line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-174-36: task 7: Exited with exit code 1 -W0703 07:43:49.649000 140200488376064 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-7.ec2.internal_2072342_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:49.892000 139815229204224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_366912_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:50.314000 139981758625536 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-202.ec2.internal_1348543_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:52.270000 139987419359040 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1348543_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:52.291000 139987419359040 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-202.ec2.internal_1348543_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-173-202: task 5: Exited with exit code 1 -W0703 07:43:54.381000 140206149109568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2072342_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 07:43:54.396000 140206149109568 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-7.ec2.internal_2072342_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Connection reset by peer - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. 
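Note on the tracebacks above (added commentary, not part of the original log): every agent bottoms out in the same `_call_store` call, first hitting a dead TCP connection ("Broken pipe" / "Connection reset by peer") that the c10d rendezvous backend re-raises as RendezvousConnectionError. With `--rdzv_backend c10d`, the store is hosted by the agent running on the rendezvous endpoint (the MASTER_ADDR:MASTER_PORT exported in bench.slurm); once that process exits, the remaining agents fail their keep-alive heartbeats and shutdowns in exactly this pattern. A minimal sketch of the failing operation, assuming the standard torch.distributed TCPStore client API (the store key below is illustrative, not taken from these logs):

import os
from datetime import timedelta
from torch.distributed import TCPStore

# Connect as a client to the store hosted by the agent on MASTER_ADDR:MASTER_PORT.
# If that host process has already died, the get() below raises a DistNetworkError,
# which the c10d rendezvous backend wraps as RendezvousConnectionError.
store = TCPStore(
    host_name=os.environ["MASTER_ADDR"],
    port=int(os.environ["MASTER_PORT"]),
    is_master=False,
    timeout=timedelta(seconds=30),
)
store.get("torchelastic/illustrative_key")  # fails once the store host is gone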
-W0703 07:43:54.897000 139815229204224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-173-246.ec2.internal_366912_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-173-7: task 4: Exited with exit code 1 -W0703 07:43:58.666000 139820889937728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_366912_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:43:58.682000 139820889937728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-173-246.ec2.internal_366912_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-173-246: task 6: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt deleted file mode 100644 index 113c9c5864d23a2e8757471844f1569dacd5b1e7..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-2/status.txt +++ /dev/null @@ -1 +0,0 @@ -timeout \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/bench.slurm deleted file mode 100644 index ac65f31e2b5e280efa0ef4f31198e802c14e4489..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. 
-# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/config.yaml deleted file mode 100644 index e844677d6c89d4b88531b89ee830f59be6abf12f..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 2 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 256 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out deleted file mode 100644 index 03476f59c2c951d086615c601d02255803f2cafe..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/log.out +++ /dev/null @@ -1,4399 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:22:11 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
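Quick sanity check on the config above (a sketch added here, not part of the original log): with dp=2, micro_batch_size=256, batch_accumulation_per_replica=2 and sequence_length=4096, the run should report a global batch size of 1024 sequences, roughly 4.2M tokens per optimizer step, which is what the "[Start training]" line later in this log prints.

# Batch geometry implied by config.yaml (values copied from the config above).
dp = 2
micro_batch_size = 256
batch_accumulation_per_replica = 2
sequence_length = 4096

global_batch_size = dp * micro_batch_size * batch_accumulation_per_replica
tokens_per_step = global_batch_size * sequence_length

print(global_batch_size)  # 1024, matching "global_batch_size: 1024" in the log
print(tokens_per_step)    # 4_194_304 tokens per step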
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:22:16.789000 140591072921408 torch/distributed/run.py:757] -W0703 09:22:16.789000 140591072921408 torch/distributed/run.py:757] ***************************************** -W0703 09:22:16.789000 140591072921408 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:22:16.789000 140591072921408 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.352000 140092553299776 torch/distributed/run.py:757] -W0703 09:22:17.352000 140092553299776 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.352000 140092553299776 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:22:17.352000 140092553299776 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.374000 139763736954688 torch/distributed/run.py:757] -W0703 09:22:17.374000 139763736954688 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.374000 139763736954688 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:22:17.374000 139763736954688 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.726000 140614064219968 torch/distributed/run.py:757] -W0703 09:22:17.726000 140614064219968 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.726000 140614064219968 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:22:17.726000 140614064219968 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.764000 139989571295040 torch/distributed/run.py:757] -W0703 09:22:17.764000 139989571295040 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.764000 139989571295040 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:22:17.764000 139989571295040 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.899000 139669372913472 torch/distributed/run.py:757] -W0703 09:22:17.899000 139669372913472 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.899000 139669372913472 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:22:17.899000 139669372913472 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.952000 140430005966656 torch/distributed/run.py:757] -W0703 09:22:17.952000 140430005966656 torch/distributed/run.py:757] ***************************************** -W0703 09:22:17.952000 140430005966656 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:22:17.952000 140430005966656 torch/distributed/run.py:757] ***************************************** -W0703 09:22:18.204000 140106230019904 torch/distributed/run.py:757] -W0703 09:22:18.204000 140106230019904 torch/distributed/run.py:757] ***************************************** -W0703 09:22:18.204000 140106230019904 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:22:18.204000 140106230019904 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:22:43 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config: -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: run='%date_%jobid', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: step=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: consumed_train_samples=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: benchmark_csv_path=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp=32, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pp_engine=, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_mode=, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: expert_parallel_size=1), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:22:43 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: eos_token_id=2, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_act='silu', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_size=2048, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: initializer_range=0.02, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: intermediate_size=4096, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: is_llama_config=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: max_position_embeddings=4096, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_attention_heads=32, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_hidden_layers=24, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_key_value_heads=32, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pad_token_id=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pretraining_tp=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_scaling=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_theta=10000.0, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tie_word_embeddings=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: use_cache=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: vocab_size=50272), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_revision=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokenizer_max_length=None), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoint_interval=100000, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: save_initial_state=False, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: log_level_replica='info', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: train_steps=20, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: micro_batch_size=256, -[default0]:07/03/2024 09:22:43 
[INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: batch_accumulation_per_replica=2, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: val_check_interval=-1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_val_batches=0, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: limit_test_batches=0), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta1=0.9, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: adam_beta2=0.95, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: name='adamW'), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: zero_stage=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: weight_decay=0.01, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: clip_grad=1.0, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_steps=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_warmup_style='linear', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_style='linear', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_steps=19, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: start_training_step=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_splits='train', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: text_column_name='text'), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: seed=42, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_loading_workers=0))], -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256')), -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: lighteval=None) -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Model Config: -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: 
eos_token_id=2, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_act='silu', -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: hidden_size=2048, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: initializer_range=0.02, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: intermediate_size=4096, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: is_llama_config=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: max_position_embeddings=4096, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_attention_heads=32, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_hidden_layers=24, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: num_key_value_heads=32, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pad_token_id=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: pretraining_tp=1, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_scaling=None, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: rope_theta=10000.0, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: tie_word_embeddings=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: use_cache=True, -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: vocab_size=50272) -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Building model.. -[default0]:07/03/2024 09:22:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Setting PP block ranks... -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=20|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=20|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=20|ip-26-0-169-239]: No checkpoint path provided. -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=16|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=16|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=16|ip-26-0-169-239]: No checkpoint path provided. -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: No checkpoint path provided. -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Parametrizing model parameters using StandardParametrizator -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=5|ip-26-0-169-139]: No checkpoint path provided. -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=27|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=27|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=24|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=24|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=27|ip-26-0-169-247]: No checkpoint path provided. -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=24|ip-26-0-169-247]: No checkpoint path provided. -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=19|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=19|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=19|ip-26-0-169-239]: No checkpoint path provided. -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=18|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=18|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=17|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=17|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=17|ip-26-0-169-239]: No checkpoint path provided. -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=23|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=23|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=23|ip-26-0-169-239]: No checkpoint path provided. -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=18|ip-26-0-169-239]: No checkpoint path provided. -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=8|ip-26-0-169-207]: No checkpoint path provided. -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-139]: No checkpoint path provided. -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=2|ip-26-0-169-139]: No checkpoint path provided. -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=6|ip-26-0-169-139]: No checkpoint path provided. -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=7|ip-26-0-169-139]: No checkpoint path provided. -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=3|ip-26-0-169-139]: No checkpoint path provided. -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=25|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=25|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=25|ip-26-0-169-247]: No checkpoint path provided. -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=4|ip-26-0-169-139]: No checkpoint path provided. -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=21|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=22|ip-26-0-169-239]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=29|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=22|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=21|ip-26-0-169-239]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=21|ip-26-0-169-239]: No checkpoint path provided. -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=22|ip-26-0-169-239]: No checkpoint path provided. -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=29|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=30|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=30|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=29|ip-26-0-169-247]: No checkpoint path provided. 
-[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=31|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=31|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=31|ip-26-0-169-247]: No checkpoint path provided. -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=30|ip-26-0-169-247]: No checkpoint path provided. -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=14|ip-26-0-169-207]: No checkpoint path provided. -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=9|ip-26-0-169-207]: No checkpoint path provided. -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=11|ip-26-0-169-207]: No checkpoint path provided. -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=10|ip-26-0-169-207]: No checkpoint path provided. -[default5]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=13|ip-26-0-169-207]: No checkpoint path provided. -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=15|ip-26-0-169-207]: No checkpoint path provided. -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=12|ip-26-0-169-207]: No checkpoint path provided. -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=26|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=26|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=26|ip-26-0-169-247]: No checkpoint path provided. -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=28|ip-26-0-169-247]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=28|ip-26-0-169-247]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:23:02 [INFO|DP=0|PP=0|TP=28|ip-26-0-169-247]: No checkpoint path provided. -[default5]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=13|ip-26-0-171-56]: No checkpoint path provided. -[default7]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=15|ip-26-0-171-56]: No checkpoint path provided. -[default6]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=14|ip-26-0-171-56]: No checkpoint path provided. -[default0]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=8|ip-26-0-171-56]: No checkpoint path provided. -[default2]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=10|ip-26-0-171-56]: No checkpoint path provided. -[default3]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=11|ip-26-0-171-56]: No checkpoint path provided. -[default0]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=0|ip-26-0-170-31]: No checkpoint path provided. -[default1]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=9|ip-26-0-171-56]: No checkpoint path provided. -[default4]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=12|ip-26-0-171-56]: No checkpoint path provided. -[default5]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=21|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=22|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=16|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=1|ip-26-0-170-31]: No checkpoint path provided. -[default0]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=24|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=2|ip-26-0-170-31]: No checkpoint path provided. -[default3]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=27|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=26|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=30|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=4|ip-26-0-170-31]: No checkpoint path provided. -[default2]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=18|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=6|ip-26-0-170-31]: No checkpoint path provided. -[default7]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=7|ip-26-0-170-31]: No checkpoint path provided. -[default3]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=3|ip-26-0-170-31]: No checkpoint path provided. -[default4]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=20|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=17|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=28|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=23|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=31|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=25|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=5|ip-26-0-170-31]: No checkpoint path provided. 
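The per-rank parameter counts above are consistent with pure tensor-parallel sharding over tp=32 (a back-of-the-envelope check, not part of the original log): 1.11G total parameters split 32 ways is roughly 34.7M per rank, and that many bf16 parameters occupy about 66 MiB, matching the repeated "Local number of parameters: 34.8M (66.33MiB)" lines.

# Cross-check of the "Local number of parameters" lines (values from the log above).
total_params = 1.11e9   # "Total number of parameters: 1.11G"
tp = 32                 # tensor-parallel degree from config.yaml

params_per_tp_rank = total_params / tp           # ~34.7M, logged as 34.8M
bf16_bytes_per_rank = params_per_tp_rank * 2     # 2 bytes per bf16 parameter
mib_per_rank = bf16_bytes_per_rank / 2**20       # ~66 MiB, logged as 66.33MiB

print(round(params_per_tp_rank / 1e6, 1), round(mib_per_rank, 1))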
-[default5]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=29|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 09:23:03 [INFO|DP=1|PP=0|TP=19|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 09:23:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:23:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:23:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 09:23:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 09:23:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:23:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Using `datasets` library -[default0]:07/03/2024 09:23:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:23:05 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: -[default0]:07/03/2024 09:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: [Start training] datetime: 2024-07-03 09:23:06.650019 | mbs: 256 | grad_accum: 2 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-139]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default0]:07/03/2024 09:23:06 [WARNING|DP=0|PP=0|TP=16|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:23:06 [WARNING|DP=1|PP=0|TP=12|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:23:06 [WARNING|DP=1|PP=0|TP=16|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:23:06 [WARNING|DP=1|PP=0|TP=21|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:23:06 [WARNING|DP=0|PP=0|TP=23|ip-26-0-169-239]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:23:06 [WARNING|DP=0|PP=0|TP=17|ip-26-0-169-239]: Repo card metadata block was not found. 
Setting CardData to empty.
-[default1]:07/03/2024 09:23:06 [WARNING|DP=0|PP=0|TP=1|ip-26-0-169-139]: Repo card metadata block was not found. Setting CardData to empty.
-[... the same "Repo card metadata block was not found. Setting CardData to empty." warning is repeated once per rank (DP=0-1, PP=0, TP=0-31), partly with and partly without the timestamped logger prefix, across nodes ip-26-0-169-139, ip-26-0-169-207, ip-26-0-169-239, ip-26-0-169-247, ip-26-0-170-31, ip-26-0-171-56, ip-26-0-171-62 and ip-26-0-171-88 ...]
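For context, this warning is emitted by the Hugging Face Hub repo-card parsing, most likely triggered when the training dataset's README.md is loaded: if the README has no YAML front-matter block, the card metadata is treated as empty, and the run is otherwise unaffected. The snippet below is only an illustrative sketch of that check under assumptions; `has_card_metadata` and the sample strings are made up for the example and are not part of bench_cluster or huggingface_hub.

# Minimal sketch: why "Repo card metadata block was not found. Setting CardData to empty."
# appears -- the parser looks for a leading "---" ... "---" YAML front-matter block in README.md.
import re

README_WITHOUT_METADATA = "# My dataset\nJust a description, no YAML front matter.\n"
README_WITH_METADATA = "---\nlicense: apache-2.0\n---\n# My dataset\n"

# Rough equivalent of the front-matter check (illustrative regex, not the library's exact one).
FRONT_MATTER = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.DOTALL)

def has_card_metadata(readme_text: str) -> bool:
    """Return True if the README starts with a YAML metadata block."""
    return FRONT_MATTER.match(readme_text) is not None

print(has_card_metadata(README_WITHOUT_METADATA))  # False -> card data falls back to empty
print(has_card_metadata(README_WITH_METADATA))     # True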
-[default1]:[rank41]: Traceback (most recent call last):
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank41]: trainer.train(dataloader)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank41]: output = model(**micro_batch)
-[default1]:[rank41]: [... torch/nn/modules/module.py wrapper frames (line 1532 in _wrapped_call_impl, line 1541 in _call_impl) elided here and before each nn.Module call below ...]
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank41]: sharded_logits = self.model(
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default1]:[rank41]: output = self.pp_block(**new_kwargs)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default1]:[rank41]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default1]:[rank41]: output = self.o_proj(attention_output)
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default1]:[rank41]: return row_linear(
-[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank41]: out = F.linear(input, weight, bias)
-[default1]:[rank41]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 2.67 GiB is free. Including non-PyTorch memory, this process has 76.65 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[... ranks 32, 38, 40, 43, 44, 45 and 47 raise the same torch.cuda.OutOfMemoryError at the same call site (F.linear inside row_linear, reached from self.o_proj in the attention block); their tracebacks, interleaved in the original log, are identical to the one above apart from the reported free memory, which ranges from 2.56 GiB to 2.92 GiB out of 79.33 GiB, with 66.62 GiB allocated by PyTorch in every case ...]
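The failing call site is the row-parallel output projection of attention: each tensor-parallel rank runs a partial F.linear on its shard of the activations and the partial results are then summed across the TP group. The sketch below is illustrative only and is not nanotron's row_linear; the class name RowParallelLinearSketch, the tp_size argument and the single-process fallback are assumptions made for the example.

# Minimal sketch of a row-parallel linear layer (illustrative, assumed API; not nanotron code).
import torch
import torch.nn.functional as F
import torch.distributed as dist


class RowParallelLinearSketch(torch.nn.Module):
    """y = x @ W^T with W split along its input (row) dimension across the TP group.

    Each rank holds a [out_features, in_features // tp_size] weight shard, computes a
    partial F.linear on its slice of the input, and the partial outputs are summed with
    an all-reduce. The partial output tensor is what F.linear has to allocate, which is
    where "Tried to allocate 4.00 GiB"-style failures surface when activations are large.
    """

    def __init__(self, in_features: int, out_features: int, tp_size: int):
        super().__init__()
        assert in_features % tp_size == 0
        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features // tp_size))
        self.bias = torch.nn.Parameter(torch.zeros(out_features))
        torch.nn.init.normal_(self.weight, std=0.02)

    def forward(self, x_shard: torch.Tensor) -> torch.Tensor:
        # x_shard: [..., in_features // tp_size] -- each rank sees its own slice of the input.
        out = F.linear(x_shard, self.weight)            # partial result, full [..., out_features]
        if dist.is_available() and dist.is_initialized():
            dist.all_reduce(out, op=dist.ReduceOp.SUM)  # sum partial results across TP ranks
        return out + self.bias                          # bias added once, after the reduction


if __name__ == "__main__":
    # Single-process smoke test (no process group): behaves like tp_size=1.
    layer = RowParallelLinearSketch(in_features=2048, out_features=2048, tp_size=1)
    print(layer(torch.randn(4, 2048)).shape)  # torch.Size([4, 2048])

As the error message itself suggests, PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True can be exported in the launcher environment to reduce fragmentation; with only about 230 MiB reserved but unallocated here, though, the failure looks like a genuine capacity shortfall for this configuration rather than fragmentation.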
-[... ranks 53, 51, 63, 62 and 34 begin the same traceback here (the same call chain through trainer.train -> training_step -> train_batch_iter -> engine.forward); their frames are interleaved with the ones below and continue further down ...]
-[default3]:[rank35]: Traceback (most recent call last):
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank35]: trainer.train(dataloader)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank35]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: hidden_encoder_states = 
encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: output = self.o_proj(attention_output) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default5]:[rank53]: sharded_logits = self.model( -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank34]: return row_linear( -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank34]: out = F.linear(input, weight, bias) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default7]:[rank63]: sharded_logits = self.model( -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank34]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.92 GiB is free. Including non-PyTorch memory, this process has 76.39 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank51]: output = self.pp_block(**new_kwargs) -[default6]:[rank62]: output = self.pp_block(**new_kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return 
self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank35]: output = self.pp_block(**new_kwargs) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank63]: output = self.pp_block(**new_kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank35]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank35]: output = self.o_proj(attention_output) -[default3]:[rank35]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_clust[default3]:[rank51]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank62]: return forward_call(*args, **kwargs) -er/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank35]: return row_linear( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank35]: out = F.linear(input, weight, bias) -[default3]:[rank35]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.74 GiB is free. Including non-PyTorch memory, this process has 76.58 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default3]:[rank51]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -n/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl - -[default4]:[rank36]: output = self.pp_block(**new_kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank36]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/py[default3]:[rank51]: output = self.o_proj(attention_output) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -thon3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank36]: output = self.o_proj(attention_output) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank36]: return row_linear( -[default4]:[rank36]: File "[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank36]: out = F.linear(input, weight, bias) -[default4]:[rank36]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.92 GiB is free. Including non-PyTorch memory, this process has 76.39 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_clust[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank62]: output = self.o_proj(attention_output) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -er/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-[default5]:[rank53]: output = self.pp_block(**new_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: output = self.o_proj(attention_output) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return row_linear( -[default7]:[rank63]: return self._call_impl(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank37]: output = self.pp_block(**new_kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank37]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default7]:[rank63]: return forward_call(*args, **kwargs) -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank51]: out = F.linear(input, weight, bias) -[default6]:[rank62]: return row_linear( -[default5]:[rank37]: output = self.o_proj(attention_output) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank53]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank63]: return row_linear( -[default6]:[rank62]: out = F.linear(input, weight, bias) -[default5]:[rank37]: return row_linear( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank37]: out = F.linear(input, weight, bias) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank63]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank37]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.74 GiB is free. Including non-PyTorch memory, this process has 76.58 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: output = self.o_proj(attention_output) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default7]:[rank63]: out = F.linear(input, weight, bias) -[default6]:[rank62]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.44 GiB is free. Including non-PyTorch memory, this process has 75.88 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank53]: return row_linear( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank63]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.55 GiB is free. Including non-PyTorch memory, this process has 75.77 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: out = F.linear(input, weight, bias) -[default5]:[rank53]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank49]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward 
-[default1]:[rank49]: sharded_logits = self.model( -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank49]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank49]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank49]: output = self.pp_block(**new_kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank49]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank49]: output = self.o_proj(attention_output) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: return self._call_impl(*args, **kwargs) -[default1]:[rank49]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: return forward_call(*args, **kwargs) -[default1]:[rank49]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank49]: return row_linear( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank49]: out = F.linear(input, weight, bias) -[default1]:[rank49]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank39]: Traceback (most recent call last): -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default1]:[rank33]: trainer.train(dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: output = model(**micro_batch) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank39]: output = self.pp_block(**new_kwargs) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank33]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: output = self.o_proj(attention_output) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank39]: return row_linear( -n/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank39]: out = F.linear(input, weight, bias) - -[default5]:[rank61]: output = self.pp_block(**new_kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank61]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank61]: output = self.o_proj(attention_output) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 15[default7]:[rank39]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.74 GiB is free. Including non-PyTorch memory, this process has 76.58 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -9, in forward -[default5]:[rank61]: return row_linear( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank61]: out = F.linear(input, weight, bias) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default5]:[rank61]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.55 GiB is free. Including non-PyTorch memory, this process has 75.77 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank33]: output = self.pp_block(**new_kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank33]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank33]: output = self.o_proj(attention_output) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank33]: return row_linear( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank33]: out = F.linear(input, weight, bias) -[default1]:[rank33]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.74 GiB is free. Including non-PyTorch memory, this process has 76.58 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank46]: output = self.pp_block(**new_kwargs) -[default6]:[rank46]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank46]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank46]: output = self.o_proj(attention_output) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank46]: return row_linear( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank46]: out = F.linear(input, weight, bias) -[default6]:[rank46]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank48]: Traceback (most recent call last): -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank48]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank48]: sharded_logits = self.model( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank48]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank48]: output = self.pp_block(**new_kwargs) -[default0]:[rank48]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank48]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank48]: output = self.o_proj(attention_output) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank48]: return row_linear( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank48]: out = F.linear(input, weight, bias) -[default0]:[rank48]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. 
GPU -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default0]:[rank56]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default0]:[rank56]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank60]: output = self.pp_block(**new_kwargs) -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank60]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: output = self.o_proj(attention_output) -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank56]: output = self.pp_block(**new_kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return row_linear( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank60]: out = F.linear(input, weight, bias) -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank56]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.44 GiB is free. Including non-PyTorch memory, this process has 75.88 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank59]: output = self.pp_block(**new_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank56]: output = self.o_proj(attention_output) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank59]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank59]: output = self.o_proj(attention_output) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: return row_linear( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank56]: out = F.linear(input, weight, bias) -[default3]:[rank59]: return row_linear( -[default0]:[rank56]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: out = F.linear(input, weight, bias) -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.55 GiB is free. Including non-PyTorch memory, this process has 75.77 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank58]: output = self.pp_block(**new_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank58]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: output = self.o_proj(attention_output) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: return row_linear( -[default2]:[rank58]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default2]:[rank58]: out = F.linear(input, weight, bias) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.44 GiB is free. Including non-PyTorch memory, this process has 75.88 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank57]: output = self.pp_block(**new_kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank57]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank57]: output = self.o_proj(attention_output) -[default1]:[rank57]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank57]: return row_linear( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank57]: out = F.linear(input, weight, bias) -[default1]:[rank57]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.55 GiB is free. Including non-PyTorch memory, this process has 75.77 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank52]: output = self.pp_block(**new_kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank52]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank52]: output = self.o_proj(attention_output) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank52]: return row_linear( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank52]: out = F.linear(input, weight, bias) 
-[default4]:[rank52]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.67 GiB is free. Including non-PyTorch memory, this process has 76.65 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: sharded_logits = self.model( -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank50]: output = self.pp_block(**new_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: 
hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default2]:[rank50]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank54]: output = self.pp_block(**new_kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank50]: output = self.o_proj(attention_output) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank54]: output = self.o_proj(attention_output) -[default2]:[rank50]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return row_linear( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank50]: out = F.linear(input, weight, bias) -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default2]:[rank50]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.67 GiB is free. Including non-PyTorch memory, this process has 76.65 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank54]: return row_linear( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank54]: out = F.linear(input, weight, bias) -[default6]:[rank54]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.67 GiB is free. Including non-PyTorch memory, this process has 76.65 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank55]: output = self.pp_block(**new_kwargs) -[default7]:[rank55]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank55]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank55]: output = self.o_proj(attention_output) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank55]: return row_linear( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank55]: out = F.linear(input, weight, bias) -[default7]:[rank55]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank42]: output = model(**micro_batch) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: sharded_logits = self.model( -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank42]: output = self.pp_block(**new_kwargs) -[default2]:[rank42]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank42]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank42]: output = self.o_proj(attention_output) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank42]: return row_linear( -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank42]: out = F.linear(input, weight, bias) -[default2]:[rank42]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-E0703 09:23:34.844000 140614064219968 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 961145) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 57 (local_rank: 1)
-  exitcode : 1 (pid: 961146)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 58 (local_rank: 2)
-  exitcode : 1 (pid: 961147)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[3]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 59 (local_rank: 3)
-  exitcode : 1 (pid: 961148)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[4]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 60 (local_rank: 4)
-  exitcode : 1 (pid: 961149)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[5]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 61 (local_rank: 5)
-  exitcode : 1 (pid: 961150)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[6]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 62 (local_rank: 6)
-  exitcode : 1 (pid: 961151)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[7]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 63 (local_rank: 7)
-  exitcode : 1 (pid: 961152)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time : 2024-07-03_09:23:34
-  host : ip-26-0-171-88.ec2.internal
-  rank : 56 (local_rank: 0)
-  exitcode : 1 (pid: 961145)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-srun: error: ip-26-0-171-88: task 7: Exited with exit code 1
-E0703 09:23:39.848000 139669372913472 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3424305) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:23:39.850000 140430005966656 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3093041) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 09:23:39.854000 140591072921408 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3973042) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 41 (local_rank: 1)
-  exitcode : 1 (pid: 3424306)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 42 (local_rank: 2)
-  exitcode : 1 (pid: 3424307)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[3]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 43 (local_rank: 3)
-  exitcode : 1 (pid: 3424308)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[4]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 44 (local_rank: 4)
-  exitcode : 1 (pid: 3424309)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[5]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 45 (local_rank: 5)
-  exitcode : 1 (pid: 3424310)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[6]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 46 (local_rank: 6)
-  exitcode : 1 (pid: 3424311)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[7]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 47 (local_rank: 7)
-  exitcode : 1 (pid: 3424312)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-56.ec2.internal
-  rank : 40 (local_rank: 0)
-  exitcode : 1 (pid: 3424305)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 33 (local_rank: 1)
-  exitcode : 1 (pid: 3093042)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 34 (local_rank: 2)
-  exitcode : 1 (pid: 3093043)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[3]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 35 (local_rank: 3)
-  exitcode : 1 (pid: 3093044)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[4]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 36 (local_rank: 4)
-  exitcode : 1 (pid: 3093045)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[5]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 37 (local_rank: 5)
-  exitcode : 1 (pid: 3093046)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[6]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 38 (local_rank: 6)
-  exitcode : 1 (pid: 3093047)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[7]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 39 (local_rank: 7)
-  exitcode : 1 (pid: 3093048)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-170-31.ec2.internal
-  rank : 32 (local_rank: 0)
-  exitcode : 1 (pid: 3093041)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-62.ec2.internal
-  rank : 49 (local_rank: 1)
-  exitcode : 1 (pid: 3973043)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[2]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-62.ec2.internal
-  rank : 50 (local_rank: 2)
-  exitcode : 1 (pid: 3973044)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[3]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-62.ec2.internal
-  rank : 51 (local_rank: 3)
-  exitcode : 1 (pid: 3973045)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[4]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-62.ec2.internal
-  rank : 52 (local_rank: 4)
-  exitcode : 1 (pid: 3973046)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[5]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-62.ec2.internal
-  rank : 53 (local_rank: 5)
-  exitcode : 1 (pid: 3973047)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[6]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-62.ec2.internal
-  rank : 54 (local_rank: 6)
-  exitcode : 1 (pid: 3973048)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-[7]:
-  time : 2024-07-03_09:23:39
-  host : ip-26-0-171-62.ec2.internal
-  rank : 55 (local_rank: 7)
-  exitcode : 1 (pid: 3973049)
-  error_file:
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time :
2024-07-03_09:23:39 - host : ip-26-0-171-62.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 3973042) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-171-56: task 5: Exited with exit code 1 -srun: error: ip-26-0-170-31: task 4: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 6: Exited with exit code 1 -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: output = model(**micro_batch) -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: output = model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward 
-[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: sharded_logits = self.model( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in 
_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank20]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: output = self.o_proj(attention_output) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank20]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: 
out = F.linear(input, weight, bias)
-[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 2.67 GiB is free. Including non-PyTorch memory, this process has 76.65 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank27]: Traceback (most recent call last):
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank27]:     trainer.train(dataloader)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank27]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank27]:     outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank27]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank27]:     output = model(**micro_batch)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank27]:     return self._call_impl(*args, **kwargs)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank27]:     return forward_call(*args, **kwargs)
[... the same pair of torch/nn/modules/module.py frames (_wrapped_call_impl, _call_impl) is repeated between each of the following frames and is omitted here ...]
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank27]:     sharded_logits = self.model(
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank27]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank27]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default3]:[rank27]:     output = self.pp_block(**new_kwargs)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default3]:[rank27]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default3]:[rank27]:     output = self.o_proj(attention_output)
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default3]:[rank27]:     return row_linear(
-[default3]:[rank27]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default3]:[rank27]:     out = F.linear(input, weight, bias)
-[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 2.92 GiB is free. Including non-PyTorch memory, this process has 76.39 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... the remaining ranks visible in this part of the log (ranks 0, 4, 6, 8, 11, 15, 16, 17, 18, 19, 21, 23, 24, 26, 29, 30) emit the same traceback, interleaved line by line, each ending in torch.cuda.OutOfMemoryError while allocating 4.00 GiB in o_proj -> row_linear -> F.linear, with 2.56 GiB to 3.55 GiB reported free out of 79.33 GiB and 66.62 GiB already allocated by PyTorch ...]
-[default5]:[rank21]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in
forward -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: sharded_logits = self.model( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: output = 
self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank6]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank6]: return row_linear( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: out = F.linear(input, weight, bias) -[default5]:[rank21]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.55 GiB is free. Including non-PyTorch memory, this process has 75.77 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default5]:[rank5]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: Traceback (most recent call last): -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: trainer.train(dataloader) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank3]: trainer.train(dataloader) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return 
self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: output = self.o_proj(attention_output) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: output = model(**micro_batch) -[default2]:[rank18]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default0]:[rank8]: output = self.o_proj(attention_output) -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.67 GiB is free. Including non-PyTorch memory, this process has 76.65 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: output = model(**micro_batch) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: sharded_logits = self.model( -[default5]:[rank5]: output = model(**micro_batch) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: sharded_logits = self.model( -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: sharded_logits = self.model( -[default1]:[rank17]: output = self.o_proj(attention_output) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: return row_linear( -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: sharded_logits = self.model( -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.56 GiB is free. Including non-PyTorch memory, this process has 76.75 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank25]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank25]: output = self.o_proj(attention_output) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank25]: return row_linear( -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.92 GiB is free. Including non-PyTorch memory, this process has 76.39 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: output = model(**micro_batch) 
-[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: output = model(**micro_batch) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: sharded_logits = self.model( -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank3]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank7]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, 
Ranks 1-3, 5, 7-15, 22, 28 and 31 all fail at the same allocation: the 4.00 GiB request made by out = F.linear(input, weight, bias) inside row_linear, reached through the attention projection o_proj. Rank 1's traceback is representative (de-interleaved from the per-rank output; the repeated torch.nn.Module dispatch frames through module.py lines 1532 and 1541 between each call are elided):

-[default1]:[rank1]: Traceback (most recent call last):
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank1]:     trainer.train(dataloader)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank1]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank1]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank1]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank1]:     output = model(**micro_batch)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank1]:     sharded_logits = self.model(
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank1]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank1]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default1]:[rank1]:     output = self.pp_block(**new_kwargs)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default1]:[rank1]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default1]:[rank1]:     output = self.o_proj(attention_output)
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default1]:[rank1]:     return row_linear(
-[default1]:[rank1]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank1]:     out = F.linear(input, weight, bias)
-[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 3.44 GiB is free. Including non-PyTorch memory, this process has 75.88 GiB memory in use. Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

The other ranks report the same error with 66.62 GiB allocated by PyTorch in every case and only the free-memory figure varying: 3.55 GiB free on rank 2, 3.44 GiB on ranks 1, 3, 5 and 7, 2.92 GiB on rank 31, 2.74 GiB on rank 28, 2.67 GiB on ranks 9, 11, 13, 15 and 22, and 2.56 GiB on ranks 10, 12 and 14. Rank 8's message is cut off in the log after "Tried to allocate 4.00 GiB. GPU".
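The allocator hint at the end of each message (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True) only mitigates fragmentation of memory PyTorch has already reserved; with 66.62 GiB allocated and a 4.00 GiB request against at most 3.55 GiB free, it is unlikely to rescue this configuration on its own, but it costs nothing to try. A minimal sketch of how it could be wired into a bash launch wrapper; the variable and value come from the error message itself, while the torchrun line is illustrative and not the exact command used for this run:

    # Opt the CUDA caching allocator into expandable segments on every rank.
    # Must be exported before the training processes start.
    export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"

    # Illustrative launch; substitute the real launcher arguments and config path.
    torchrun --nproc_per_node 8 run_train.py --config-file config.yaml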
Of the allocated memory 66.62 GiB is allocated by PyTorch, and 230.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -E0703 09:23:54.869000 139989571295040 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 563284) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 563285) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 563286) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 563287) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 563288) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 563289) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 563290) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 563291) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html 
------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:23:54 - host : ip-26-0-169-139.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 563284) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-139: task 0: Exited with exit code 1 -W0703 09:23:59.521000 140086892566272 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-239.ec2.internal_2552771_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.602000 140100569286400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-247.ec2.internal_33038_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.687000 139758076221184 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-207.ec2.internal_2581262_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -E0703 09:23:59.793000 139763736954688 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2581335) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:23:59.799000 139763736954688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2581262_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.825000 139763736954688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2581262_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.855000 139763736954688 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2581262_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
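The tracebacks above all stop at the same allocation: each tensor-parallel rank on this node requests 4.00 GiB inside the o_proj row-linear (F.linear) while only about 2.6 GiB of the 79.33 GiB card is still free. The error message itself suggests an allocator setting; a minimal, hypothetical mitigation (not part of the original bench.slurm) would be to export it before the srun launch:

    # Hypothetical tweak taken from the OOM message above, not from the original script:
    # let the CUDA caching allocator grow segments on demand to reduce fragmentation.
    export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
    srun -u $LAUNCHER $CMD

Note that expandable segments only helps when reserved-but-unallocated memory is large; here it is only ~230 MiB, so with roughly 2.6 GiB free against a 4 GiB request, a smaller micro-batch or a different parallelism split is the more likely fix.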
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 2581336) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 2581337) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 2581338) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 2581339) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 2581340) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 2581341) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 2581342) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-207.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 2581335) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:23:59.875000 140106230019904 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 33112) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:23:59.877000 140092553299776 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2552845) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 09:23:59.881000 140106230019904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_33038_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.884000 140092553299776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2552771_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.915000 140106230019904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_33038_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.917000 140092553299776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2552771_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 09:23:59.946000 140092553299776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-239.ec2.internal_2552771_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 2552846) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 2552847) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 2552848) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: 
- time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 2552849) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 2552850) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 2552851) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 2552852) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-239.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 2552845) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -W0703 09:23:59.954000 140106230019904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-247.ec2.internal_33038_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 33113) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 33114) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 33115) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 33116) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 33117) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 33118) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 33119) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:23:59 - host : ip-26-0-169-247.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 33112) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-169-207: task 1: Exited with exit code 1 -srun: error: ip-26-0-169-247: task 3: Exited with exit code 1 -srun: error: ip-26-0-169-239: task 2: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-256/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/bench.slurm deleted file mode 100644 index 54b16c7e7400c552bdbf4b0bf6f177e9f1f3e253..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/config.yaml deleted file mode 100644 index 331d213ad97f40b9cb984363f63abe360bb03a58..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 16 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 32 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out deleted file mode 100644 index 03c8c7ee01883da4419b53931a2a7aa3b7d70e22..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/log.out +++ /dev/null @@ -1,734 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:21:42 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:21:44.674000 139982020101952 torch/distributed/run.py:757] -W0703 03:21:44.674000 139982020101952 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.674000 139982020101952 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:21:44.674000 139982020101952 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.673000 140055400159040 torch/distributed/run.py:757] -W0703 03:21:44.673000 140055400159040 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.673000 140055400159040 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:21:44.673000 140055400159040 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.686000 140083007547200 torch/distributed/run.py:757] -W0703 03:21:44.686000 140083007547200 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.686000 140083007547200 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:21:44.686000 140083007547200 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.686000 140247774861120 torch/distributed/run.py:757] -W0703 03:21:44.686000 140247774861120 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.686000 140247774861120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:21:44.686000 140247774861120 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.696000 140048238819136 torch/distributed/run.py:757] -W0703 03:21:44.696000 140048238819136 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.696000 140048238819136 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
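The repeated torchrun banner above notes that OMP_NUM_THREADS defaults to 1 for every worker and asks the user to tune it. A hypothetical adjustment (not present in the original bench.slurm) would be to pin it explicitly to the CPUs available per local GPU in this allocation, i.e. the 96 cpus-per-task split across 8 GPUs per node:

    # Hypothetical: set the OpenMP thread count explicitly instead of relying on torchrun's default of 1.
    export OMP_NUM_THREADS=12   # 96 CPUs per task / 8 local GPUs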
-W0703 03:21:44.696000 140048238819136 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.700000 140608614643520 torch/distributed/run.py:757] -W0703 03:21:44.700000 140608614643520 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.700000 140608614643520 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:21:44.700000 140608614643520 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.710000 140025939732288 torch/distributed/run.py:757] -W0703 03:21:44.710000 140025939732288 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.710000 140025939732288 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:21:44.710000 140025939732288 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.743000 140580703721280 torch/distributed/run.py:757] -W0703 03:21:44.743000 140580703721280 torch/distributed/run.py:757] ***************************************** -W0703 03:21:44.743000 140580703721280 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:21:44.743000 140580703721280 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:22:05 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:22:05 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=32, -[default0]:07/03/2024 03:22:05 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=16, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32')), -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 03:22:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: No checkpoint path provided. 
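The per-rank and total parameter counts logged above are consistent with the pure tensor-parallel layout (tp=32, pp=1): each TP rank holds roughly 34.8M parameters (66.33 MiB in bf16), and 32 × 66.33 MiB ≈ 2122.6 MiB, matching the reported total of 1.11G parameters (2122.50 MiB) up to rounding. A throwaway check, assuming nothing beyond the numbers printed in the log:

    # Hypothetical back-of-the-envelope check of the sharding reported above.
    echo "32 * 66.33" | bc       # 2122.56 MiB vs. the logged 2122.50MiB total
    echo $(( 32 * 34800000 ))    # 1113600000 params, i.e. the logged 1.11G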
-[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. 
-[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 03:22:23 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=24|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=27|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=25|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=16|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default5]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=19|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 03:22:23 [INFO|DP=1|PP=0|TP=6|ip-26-0-168-238]: No checkpoint path provided. 
-[default0]:07/03/2024 03:22:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 03:22:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 03:22:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states
-[default0]:07/03/2024 03:22:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states
-[default0]:07/03/2024 03:22:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 03:22:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library
-[default0]:07/03/2024 03:22:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:07/03/2024 03:22:26 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 03:22:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 03:22:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 03:22:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 03:22:27.380750 | mbs: 32 | grad_accum: 16 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 03:22:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 03:22:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB
[… the same "Repo card metadata block was not found. Setting CardData to empty." warning, emitted by every other DP=0 and DP=1 rank, omitted …]
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
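As a readability aid (not part of the original logs), here is a minimal sketch of the arithmetic behind the [Start training] and [ZeRO sharding] lines above. The constants are taken from the log itself; dp=2 is inferred from the two DP ranks reported, and the variable names are ours:

```python
# Sanity-check of the batch-size and ZeRO-1 sharding figures reported by rank 0.
mbs = 32                  # micro-batch size per data-parallel rank (from the log)
grad_accum = 16           # gradient-accumulation steps (from the log)
dp = 2                    # data-parallel degree, inferred from "DP Rank 0 / DP Rank 1"
sequence_length = 4096    # from the log

global_batch_size = mbs * grad_accum * dp          # samples per optimizer step
tokens_per_step = global_batch_size * sequence_length

local_params = 34.8e6     # parameters held per tensor-parallel rank (34.8M)
zero1_shard = local_params / dp                    # optimizer states split evenly over DP ranks

print(global_batch_size)       # 1024, matching the [Start training] line
print(tokens_per_step)         # 4194304 tokens processed per step
print(zero1_shard / 1e6)       # 17.4 (M params), matching the [ZeRO sharding] lines
```

The printed values line up with the global_batch_size of 1024 and the 17.4M-parameter optimizer-state shards logged above.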
If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:07/03/2024 03:22:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 367.59MiB. Peak allocated 33591.14MiB. Peak reserved: 34838.00MiB -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]: warnings.warn( -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default0]:07/03/2024 03:22:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 22K | tokens_per_sec: 191K | tokens_per_sec_per_gpu: 2.98K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 0.0001 | model_tflops_per_gpu: 27 | hardware_tflops_per_gpu: 27 | grad_norm: 11.1 | cuda_memory_allocated: 525M | cuda_max_memory_reserved: 36.6G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 03:22:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 666.14MiB. Peak reserved: 34890.00MiB -[default0]:07/03/2024 03:22:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.33MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:23:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 10.9K | tokens_per_sec: 385K | tokens_per_sec_per_gpu: 6.01K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 9.53e-05 | model_tflops_per_gpu: 54.5 | hardware_tflops_per_gpu: 54.5 | grad_norm: 11.1 | cuda_memory_allocated: 525M | cuda_max_memory_reserved: 36.7G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 03:23:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 666.15MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:23:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.33MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:23:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 10.8K | tokens_per_sec: 389K | tokens_per_sec_per_gpu: 6.08K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.05e-05 | model_tflops_per_gpu: 55.2 | hardware_tflops_per_gpu: 55.2 | grad_norm: 76 | cuda_memory_allocated: 525M | cuda_max_memory_reserved: 36.7G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 03:23:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 666.15MiB. Peak reserved: 34958.00MiB -[default0]:STAGE:2024-07-03 03:23:11 1138202:1138202 ActivityProfilerController.cpp:314] Completed Stage: Warm Up -[default0]:07/03/2024 03:23:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.33MiB. Peak allocated 33723.88MiB. 
Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:23:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 10.6K | tokens_per_sec: 397K | tokens_per_sec_per_gpu: 6.21K | global_batch_size: 1.02K | lm_loss: 11.5 | lr: 8.58e-05 | model_tflops_per_gpu: 56.3 | hardware_tflops_per_gpu: 56.3 | grad_norm: 15.1 | cuda_memory_allocated: 525M | cuda_max_memory_reserved: 36.7G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 03:23:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 666.15MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:23:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 399K | tokens_per_sec_per_gpu: 6.23K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 8.11e-05 | model_tflops_per_gpu: 56.5 | hardware_tflops_per_gpu: 56.5 | grad_norm: 24.2 -[default0]:07/03/2024 03:23:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:23:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 398K | tokens_per_sec_per_gpu: 6.22K | global_batch_size: 1.02K | lm_loss: 10.1 | lr: 7.63e-05 | model_tflops_per_gpu: 56.5 | hardware_tflops_per_gpu: 56.5 | grad_norm: 12.8 -[default0]:STAGE:2024-07-03 03:23:51 1138202:1138202 ActivityProfilerController.cpp:320] Completed Stage: Collection -[default0]:STAGE:2024-07-03 03:23:52 1138202:1138202 ActivityProfilerController.cpp:324] Completed Stage: Post Processing -[default0]:07/03/2024 03:25:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:25:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 398K | tokens_per_sec_per_gpu: 6.22K | global_batch_size: 1.02K | lm_loss: 9.9 | lr: 7.16e-05 | model_tflops_per_gpu: 56.5 | hardware_tflops_per_gpu: 56.5 | grad_norm: 9.28 -[default0]:07/03/2024 03:25:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:25:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 400K | tokens_per_sec_per_gpu: 6.25K | global_batch_size: 1.02K | lm_loss: 9.56 | lr: 6.68e-05 | model_tflops_per_gpu: 56.7 | hardware_tflops_per_gpu: 56.7 | grad_norm: 6.91 -[default0]:07/03/2024 03:25:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:25:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 399K | tokens_per_sec_per_gpu: 6.23K | global_batch_size: 1.02K | lm_loss: 9.2 | lr: 6.21e-05 | model_tflops_per_gpu: 56.5 | hardware_tflops_per_gpu: 56.5 | grad_norm: 6.5 -[default0]:07/03/2024 03:25:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. 
Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:25:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 399K | tokens_per_sec_per_gpu: 6.23K | global_batch_size: 1.02K | lm_loss: 8.88 | lr: 5.74e-05 | model_tflops_per_gpu: 56.5 | hardware_tflops_per_gpu: 56.5 | grad_norm: 5.55 -[default0]:07/03/2024 03:25:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:25:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 400K | tokens_per_sec_per_gpu: 6.25K | global_batch_size: 1.02K | lm_loss: 8.69 | lr: 5.26e-05 | model_tflops_per_gpu: 56.7 | hardware_tflops_per_gpu: 56.7 | grad_norm: 5.86 -[default0]:07/03/2024 03:25:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:26:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 401K | tokens_per_sec_per_gpu: 6.27K | global_batch_size: 1.02K | lm_loss: 8.48 | lr: 4.79e-05 | model_tflops_per_gpu: 56.9 | hardware_tflops_per_gpu: 56.9 | grad_norm: 5.82 -[default0]:07/03/2024 03:26:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:26:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 401K | tokens_per_sec_per_gpu: 6.27K | global_batch_size: 1.02K | lm_loss: 8.25 | lr: 4.32e-05 | model_tflops_per_gpu: 56.9 | hardware_tflops_per_gpu: 56.9 | grad_norm: 5.08 -[default0]:07/03/2024 03:26:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:26:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 400K | tokens_per_sec_per_gpu: 6.26K | global_batch_size: 1.02K | lm_loss: 8.1 | lr: 3.84e-05 | model_tflops_per_gpu: 56.8 | hardware_tflops_per_gpu: 56.8 | grad_norm: 5.08 -[default0]:07/03/2024 03:26:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:26:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 10.4K | tokens_per_sec: 402K | tokens_per_sec_per_gpu: 6.27K | global_batch_size: 1.02K | lm_loss: 7.99 | lr: 3.37e-05 | model_tflops_per_gpu: 56.9 | hardware_tflops_per_gpu: 56.9 | grad_norm: 5.11 -[default0]:07/03/2024 03:26:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 401K | tokens_per_sec_per_gpu: 6.27K | global_batch_size: 1.02K | lm_loss: 7.9 | lr: 2.89e-05 | model_tflops_per_gpu: 56.8 | hardware_tflops_per_gpu: 56.8 | grad_norm: 5.13 -[default0]:07/03/2024 03:26:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. 
Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:26:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 401K | tokens_per_sec_per_gpu: 6.27K | global_batch_size: 1.02K | lm_loss: 7.78 | lr: 2.42e-05 | model_tflops_per_gpu: 56.9 | hardware_tflops_per_gpu: 56.9 | grad_norm: 4.9 -[default0]:07/03/2024 03:26:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:27:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 401K | tokens_per_sec_per_gpu: 6.27K | global_batch_size: 1.02K | lm_loss: 7.67 | lr: 1.95e-05 | model_tflops_per_gpu: 56.9 | hardware_tflops_per_gpu: 56.9 | grad_norm: 4.65 -[default0]:07/03/2024 03:27:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:27:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 400K | tokens_per_sec_per_gpu: 6.25K | global_batch_size: 1.02K | lm_loss: 7.58 | lr: 1.47e-05 | model_tflops_per_gpu: 56.7 | hardware_tflops_per_gpu: 56.7 | grad_norm: 4.55 -[default0]:07/03/2024 03:27:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 500.32MiB. Peak allocated 33723.88MiB. Peak reserved: 34958.00MiB -[default0]:07/03/2024 03:27:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 10.5K | tokens_per_sec: 400K | tokens_per_sec_per_gpu: 6.25K | global_batch_size: 1.02K | lm_loss: 7.53 | lr: 1e-05 | model_tflops_per_gpu: 56.7 | hardware_tflops_per_gpu: 56.7 | grad_norm: 4.51 -W0703 03:27:52.458000 140049739425536 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_903397_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousTimeoutError. -Saved 1 csv files over 1 completed logs -Processing file: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/profiler/ip-26-0-160-192_1138202.1719977088028445270.pt.trace.json -Results written to /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-32/profiler.csv -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-192_1138202.1719977088028445270.pt.trace.json: 0%| | 0.00/2.30G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/config.yaml deleted file mode 100644 index d767f0bdf10fa0ddc7238a685da1b507aab1db6f..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 128 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 4 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/log.out deleted file mode 100644 index 8c8a801b8b01a430c57296cff138e82ed3820d24..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4/log.out +++ /dev/null @@ -1,2302 +0,0 @@ -======================== -START TIME: Wed Jul 3 01:13:48 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
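For context, the parallel layout in the config above multiplies out to the 64-GPU job that the surrounding bench.slurm launches: dp * tp * pp = 2 * 32 * 1 = 64 ranks, which torchrun supplies as 8 nodes with 8 processes each. A minimal sanity-check sketch (plain Python; the variable names are illustrative, the numbers are taken from the config and launcher flags above):

    # Hypothetical check that the parallel grid matches the launched world size.
    dp, tp, pp = 2, 32, 1                 # from the parallelism block in config.yaml
    nnodes, nproc_per_node = 8, 8         # from the torchrun launcher flags
    assert dp * tp * pp == nnodes * nproc_per_node == 64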
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 01:13:55.783000 139740769310528 torch/distributed/run.py:757] -W0703 01:13:55.783000 139740769310528 torch/distributed/run.py:757] ***************************************** -W0703 01:13:55.783000 139740769310528 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:13:55.783000 139740769310528 torch/distributed/run.py:757] ***************************************** -W0703 01:13:55.824000 140541891254080 torch/distributed/run.py:757] -W0703 01:13:55.824000 140541891254080 torch/distributed/run.py:757] ***************************************** -W0703 01:13:55.824000 140541891254080 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:13:55.824000 140541891254080 torch/distributed/run.py:757] ***************************************** -W0703 01:13:55.949000 140010758211392 torch/distributed/run.py:757] -W0703 01:13:55.949000 140010758211392 torch/distributed/run.py:757] ***************************************** -W0703 01:13:55.949000 140010758211392 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:13:55.949000 140010758211392 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.038000 139864455763776 torch/distributed/run.py:757] -W0703 01:13:56.038000 139864455763776 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.038000 139864455763776 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:13:56.038000 139864455763776 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.040000 140006959662912 torch/distributed/run.py:757] -W0703 01:13:56.040000 140006959662912 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.040000 140006959662912 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 01:13:56.040000 140006959662912 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.076000 139650531702592 torch/distributed/run.py:757] -W0703 01:13:56.076000 139650531702592 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.076000 139650531702592 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:13:56.076000 139650531702592 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.361000 139625482209088 torch/distributed/run.py:757] -W0703 01:13:56.361000 139625482209088 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.361000 139625482209088 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:13:56.361000 139625482209088 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.646000 140125971289920 torch/distributed/run.py:757] -W0703 01:13:56.646000 140125971289920 torch/distributed/run.py:757] ***************************************** -W0703 01:13:56.646000 140125971289920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:13:56.646000 140125971289920 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 01:14:22 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:14:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=4, -[default0]:07/03/2024 01:14:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=128, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-4')), -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 01:14:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: No checkpoint path provided. -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: No checkpoint path provided. -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. 
-[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 01:14:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default1]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=25|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=27|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=24|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=16|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=19|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided. 
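The per-rank and total parameter counts logged above are consistent with tensor parallelism of 32 in bfloat16: each TP shard holds roughly 34.8M parameters (about 66.4MiB at 2 bytes per value; the log's 66.33MiB reflects the unrounded count), and 32 shards give the reported 1.11G parameters (2122.50MiB). A back-of-the-envelope sketch, assuming only the figures in the log (Python, illustrative only):

    # Reproduce the logged memory figures from the per-rank parameter count.
    local_params = 34.8e6                  # "Local number of parameters" per TP rank (rounded in the log)
    tp = 32
    bytes_per_param = 2                    # bfloat16
    local_mib = local_params * bytes_per_param / 2**20
    total_params = local_params * tp
    print(f"{local_mib:.2f} MiB per rank, {total_params/1e9:.2f}B params total")
    # ~66.38 MiB per rank and ~1.11B parameters, matching the log above.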
-[default6]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 01:14:46 [INFO|DP=1|PP=0|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 01:14:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 01:14:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 01:14:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 01:14:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 01:14:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 01:14:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 01:14:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 01:14:49 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:14:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 01:14:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 01:14:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 01:14:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 01:14:51.324463 | mbs: 4 | grad_accum: 128 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 01:14:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 01:14:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=24|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=27|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=26|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
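The [Start training] line and the ZeRO sharding report above follow directly from the config: global_batch_size = micro_batch_size * batch_accumulation_per_replica * dp = 4 * 128 * 2 = 1024 sequences of 4096 tokens, i.e. about 4.19M tokens per step, and ZeRO stage 1 splits each rank's 34.8M parameters' optimizer states evenly across the dp=2 replicas, giving the logged 17.4M (50.00%) per DP rank. A short sketch of that arithmetic (Python, illustrative only):

    # Derive the logged global batch size and ZeRO-1 optimizer-state split.
    mbs, grad_accum, dp = 4, 128, 2
    seq_len = 4096
    global_batch = mbs * grad_accum * dp          # 1024, as logged
    tokens_per_step = global_batch * seq_len      # 4,194,304 ~= 4.19M tokens per step
    local_params = 34.8e6
    optim_states_per_dp_rank = local_params / dp  # 17.4M (50.00%) per DP rank, as logged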
-[default0]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=26|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=16|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=17|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=27|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=21|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=23|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=24|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=29|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=17|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=30|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=28|ip-26-0-163-226]: Repo card metadata block was not found. 
Setting CardData to empty. -[default7]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=31|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=31|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=29|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=22|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=21|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=20|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=18|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=25|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=30|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=19|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=25|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=16|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=19|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=23|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 01:14:51 [WARNING|DP=1|PP=0|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=18|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=22|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:51 [WARNING|DP=0|PP=0|TP=20|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:14:52 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:52 [WARNING|DP=1|PP=0|TP=28|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:14:52 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:14:52 [WARNING|DP=1|PP=0|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
[... the same c10d::allreduce_ autograd UserWarning (torch/autograd/graph.py:744) and the accompanying run_backward line are emitted repeatedly by every rank [default0]-[default7] during the backward pass; verbatim duplicates omitted ...]
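The warning above names its own remedy: register a fallthrough kernel for the op on the Autograd dispatch key. Below is a minimal, hedged sketch of how that suggestion could be applied from Python via torch.library; the use of torch.library.fallthrough_kernel and the idea of patching c10d::allreduce_ this way are illustrative assumptions, not part of the bench_cluster scripts.

```python
# Hypothetical sketch: silence the "autograd kernel was not registered" warning
# for c10d::allreduce_ by registering a fallthrough on the Autograd key,
# mirroring the C++ suggestion in the warning
# (torch::CppFunction::makeFallthrough() -> DispatchKey::Autograd).
# Assumes a PyTorch build exposing torch.library.fallthrough_kernel.
import torch
import torch.library

# Open the existing c10d namespace for additional kernel registrations.
_c10d_lib = torch.library.Library("c10d", "IMPL")

# Register a fallthrough for the Autograd dispatch key of allreduce_.
_c10d_lib.impl("allreduce_", torch.library.fallthrough_kernel, "Autograd")
```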
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn(
[... the same all_reduce_coalesced deprecation warning is emitted repeatedly by every rank [default0]-[default7]; verbatim duplicates omitted ...]
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default5]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:07/03/2024 01:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 365.68MiB. Peak allocated 4518.53MiB. Peak reserved: 4872.00MiB -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
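A note on the UserWarning above: torch.distributed.all_reduce_coalesced is slated for deprecation, and the same coalesced reduction can be expressed with plain torch.distributed.all_reduce over one flat buffer. The sketch below is illustrative only, not nanotron's code; it assumes all tensors share dtype and device, and the helper name all_reduce_bucket is made up for this example.

import torch
import torch.distributed as dist

def all_reduce_bucket(tensors, op=dist.ReduceOp.SUM):
    # Flatten the bucket into a single buffer, reduce it with one collective,
    # then copy the reduced values back into the original tensors.
    flat = torch.cat([t.reshape(-1) for t in tensors])
    dist.all_reduce(flat, op=op)
    offset = 0
    for t in tensors:
        n = t.numel()
        t.copy_(flat[offset:offset + n].view_as(t))
        offset += n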
-[default0]:07/03/2024 01:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 55.9K | tokens_per_sec: 75.1K | tokens_per_sec_per_gpu: 1.17K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 0.0001 | model_tflops_per_gpu: 10.6 | hardware_tflops_per_gpu: 10.6 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 5.17G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 01:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.41MiB. Peak allocated 664.23MiB. Peak reserved: 4932.00MiB
-[default0]:07/03/2024 01:16:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.47MiB. Peak allocated 4651.32MiB. Peak reserved: 4992.00MiB
-[default0]:07/03/2024 01:16:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 18.5K | tokens_per_sec: 226K | tokens_per_sec_per_gpu: 3.54K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 9.53e-05 | model_tflops_per_gpu: 32.1 | hardware_tflops_per_gpu: 32.1 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 5.23G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 01:16:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.41MiB. Peak allocated 664.29MiB. Peak reserved: 4992.00MiB
-[default0]:07/03/2024 01:16:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.47MiB. Peak allocated 4651.32MiB. Peak reserved: 4992.00MiB
-[default0]:07/03/2024 01:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 20.3K | tokens_per_sec: 207K | tokens_per_sec_per_gpu: 3.24K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.05e-05 | model_tflops_per_gpu: 29.4 | hardware_tflops_per_gpu: 29.4 | grad_norm: 76 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 5.23G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 01:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.41MiB. Peak allocated 664.29MiB. Peak reserved: 4992.00MiB
-[default0]:STAGE:2024-07-03 01:16:26 1115867:1115867 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
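As a quick cross-check of the iteration lines above, the reported throughput follows directly from the logged numbers (ranks 0-63 appear in this log, so 64 GPUs are assumed as the divisor); this is only a sanity-check sketch, not part of the original log:

# Cross-check of iteration 2's reported throughput, using values from the log above.
tokens_this_iter = 8.39e6 - 4.19e6      # consumed_tokens delta between iterations 1 and 2 (~4.2M)
elapsed_s = 18.5                        # elapsed_time_per_iteration_ms: 18.5K
n_gpus = 64                             # ranks 0-63 appear in this log
tokens_per_sec = tokens_this_iter / elapsed_s        # ~227K, log reports 226K
tokens_per_sec_per_gpu = tokens_per_sec / n_gpus     # ~3.5K, log reports 3.54K
print(f"{tokens_per_sec:.3g} tok/s total, {tokens_per_sec_per_gpu:.3g} tok/s/GPU")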
-[default0]:07/03/2024 01:16:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.47MiB. Peak allocated 4651.32MiB. Peak reserved: 4992.00MiB
-[default0]:07/03/2024 01:16:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 27.2K | tokens_per_sec: 154K | tokens_per_sec_per_gpu: 2.41K | global_batch_size: 1.02K | lm_loss: 11.5 | lr: 8.58e-05 | model_tflops_per_gpu: 21.8 | hardware_tflops_per_gpu: 21.8 | grad_norm: 15.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 5.23G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 01:16:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.41MiB. Peak allocated 664.29MiB. Peak reserved: 4992.00MiB
-[default0]:07/03/2024 01:17:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 28.4K | tokens_per_sec: 148K | tokens_per_sec_per_gpu: 2.31K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 8.11e-05 | model_tflops_per_gpu: 20.9 | hardware_tflops_per_gpu: 20.9 | grad_norm: 24.3
-[default0]:07/03/2024 01:17:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.41MiB. Peak allocated 4651.32MiB. Peak reserved: 4992.00MiB
-[default0]:07/03/2024 01:17:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 27.4K | tokens_per_sec: 153K | tokens_per_sec_per_gpu: 2.39K | global_batch_size: 1.02K | lm_loss: 10.1 | lr: 7.63e-05 | model_tflops_per_gpu: 21.7 | hardware_tflops_per_gpu: 21.7 | grad_norm: 12.8
-[default0]:STAGE:2024-07-03 01:19:02 1115867:1115867 ActivityProfilerController.cpp:320] Completed Stage: Collection
-[default0]:STAGE:2024-07-03 01:19:09 1115867:1115867 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600023 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out.
-[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600080 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600075 milliseconds before timing out.
-[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600063 milliseconds before timing out.
-[... equivalent ProcessGroupNCCL.cpp:563 watchdog timeout messages follow for the remaining ranks: ranks 1-31 time out on the same _REDUCE_SCATTER_BASE collective (SeqNum=152896, NumelIn=33554432, NumelOut=1048576), while ranks 32-63 time out on ALLREDUCE SeqNum=165 (NumelIn=NumelOut=34775040), on _ALLGATHER_BASE / _REDUCE_SCATTER_BASE work around SeqNum=178247-178368, and on COALESCED work (SeqNum=178368, NumelIn=NumelOut=2048), all with Timeout(ms)=600000 and elapsed times of roughly 600000-603250 ms; the per-rank duplicates are omitted here ...]
-[default0]:07/03/2024 01:28:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.41MiB. Peak allocated 4651.32MiB. Peak reserved: 4992.00MiB
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 18] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:577] [Rank 18] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default2]:[rank18]:[E ProcessGroupNCCL.cpp:583] [Rank 18] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 16] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:577] [Rank 16] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:583] [Rank 16] To avoid data inconsistency, we are taking the entire process down.
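The Timeout(ms)=600000 in these messages is the process group's collective timeout: each NCCL collective in this run was allowed 10 minutes before the watchdog tore the job down. A minimal sketch of where that knob lives when debugging such hangs; this is not the nanotron launch code, it assumes a torchrun-style env:// rendezvous, and the 30-minute value is illustrative:

import datetime
import torch.distributed as dist

# The collective timeout is fixed when the process group is created; the NCCL
# watchdog applies it to every subsequent collective on that group.
dist.init_process_group(
    backend="nccl",
    timeout=datetime.timedelta(minutes=30),  # this run effectively used 10 minutes (600000 ms)
)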
-[default0]:[rank16]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 16] Process group watchdog thread terminated with exception: [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600055 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb8c330d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb8c45e6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb8c45eba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb8c45ecdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7fb910085e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7fb9150cc609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7fb914e97353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 2 Rank 16] Process group watchdog thread terminated with exception: [Rank 16] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600055 milliseconds before timing out.
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb8c330d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb8c45e6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb8c45eba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb8c45ecdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: + 0xd3e95 (0x7fb910085e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #5: + 0x8609 (0x7fb9150cc609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #6: clone + 0x43 (0x7fb914e97353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb8c330d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: + 0xe32119 (0x7fb8c4270119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: + 0xd3e95 (0x7fb910085e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: + 0x8609 (0x7fb9150cc609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7fb914e97353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 19] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:577] [Rank 19] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:583] [Rank 19] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 20] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895.
-[default3]:[rank19]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 19] Process group watchdog thread terminated with exception: [Rank 19] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600088 milliseconds before timing out.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 22] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:577] [Rank 22] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:583] [Rank 22] To avoid data inconsistency, we are taking the entire process down.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:577] [Rank 20] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:583] [Rank 20] To avoid data inconsistency, we are taking the entire process down.
-[default6]:[rank22]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 22] Process group watchdog thread terminated with exception: [Rank 22] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600078 milliseconds before timing out.
-[default4]:[rank20]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 20] Process group watchdog thread terminated with exception: [Rank 20] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600095 milliseconds before timing out.
-[... the corresponding checkTimeout / ncclCommWatchdog stack traces and the 'terminate called after throwing an instance of c10::DistBackendError' / what() messages for Ranks 19, 20 and 22 are interleaved across [default3], [default4] and [default6] here; frame for frame they match the Rank 16 trace above, differing only in library load addresses ...]
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fea802e4897 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]: -[default3]:frame #4: clone + 0x43 (0x7f08e38c4353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default4]:frame #1: + 0xe32119 (0x7fea81247119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7feacd05ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fead20a3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fead1e6e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 17] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:577] [Rank 17] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:583] [Rank 17] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 17] Process group watchdog thread terminated with exception: [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5639b53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f563ae2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f563ae31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f563ae32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f56868cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f568b912609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f568b6dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 17] Process group watchdog thread terminated with exception: [Rank 17] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5639b53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f563ae2cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f563ae31a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f563ae32dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f56868cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f568b912609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f568b6dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5639b53897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f563aab6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f56868cbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f568b912609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f568b6dd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 21] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:577] [Rank 21] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:583] [Rank 21] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 21] Process group watchdog thread terminated with exception: [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf52433897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcf5370cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcf53711a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcf53712dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fcf9f1abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fcfa41f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fcfa3fbd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 21] Process group watchdog thread terminated with exception: [Rank 21] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf52433897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fcf5370cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fcf53711a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fcf53712dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fcf9f1abe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fcfa41f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fcfa3fbd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fcf52433897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fcf53396119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fcf9f1abe95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fcfa41f2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fcfa3fbd353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 23] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:577] [Rank 23] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:583] [Rank 23] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 23] Process group watchdog thread terminated with exception: [Rank 23] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9764f8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9766267c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f976626ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f976626ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f97b1d06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f97b6d4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f97b6b18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 23] Process group watchdog thread terminated with exception: [Rank 23] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600070 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9764f8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9766267c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f976626ca80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f976626ddcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f97b1d06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f97b6d4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f97b6b18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9764f8e897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f9765ef1119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f97b1d06e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f97b6d4d609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f97b6b18353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 18] Process group watchdog thread terminated with exception: [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa86392c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa864c05c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa864c0aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa864c0bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa8b06a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa8b56eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa8b54b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 18] Process group watchdog thread terminated with exception: [Rank 18] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa86392c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa864c05c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa864c0aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa864c0bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa8b06a4e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa8b56eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa8b54b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa86392c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fa86488f119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fa8b06a4e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fa8b56eb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fa8b54b6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 28] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:577] [Rank 28] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:583] [Rank 28] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 29] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:577] [Rank 29] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:583] [Rank 29] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 29] Process group watchdog thread terminated with exception: [Rank 29] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faae0aa6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faae1d7fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faae1d84a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faae1d85dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fab2d81ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fab32865609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fab32630353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 29] Process group watchdog thread terminated with exception: [Rank 29] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faae0aa6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faae1d7fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faae1d84a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faae1d85dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fab2d81ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fab32865609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fab32630353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faae0aa6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7faae1a09119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fab2d81ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fab32865609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fab32630353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 24] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:577] [Rank 24] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:583] [Rank 24] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 24] Process group watchdog thread terminated with exception: [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff904786897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff905a5fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff905a64a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff905a65dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff9514fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7ff956545609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff956310353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 2 Rank 24] Process group watchdog thread terminated with exception: [Rank 24] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600063 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff904786897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 27] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:577] [Rank 27] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:583] [Rank 27] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 27] Process group watchdog thread terminated with exception: [Rank 27] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ff905a5fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ff905a64a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ff905a65dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7ff9514fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9bfb95897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd9c0e6ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #5: + 0x8609 (0x7ff956545609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7ff956310353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd9c0e73a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ff904786897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd9c0e74dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #1: + 0xe32119 (0x7ff9056e9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7ff9514fee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7ff956545609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: + 0xd3e95 (0x7fda0c90de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fda11954609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7ff956310353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default3]:frame #6: clone + 0x43 (0x7fda1171f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 2 Rank 27] Process group watchdog thread terminated with exception: [Rank 27] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600007 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9bfb95897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd9c0e6ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd9c0e73a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd9c0e74dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7fda0c90de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7fda11954609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7fda1171f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9bfb95897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7fd9c0af8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7fda0c90de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7fda11954609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7fda1171f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 30] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:577] [Rank 30] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:583] [Rank 30] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 30] Process group watchdog thread terminated with exception: [Rank 30] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f99760e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f99773bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f99773c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f99773c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f99c2e5ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f99c7ea5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f99c7c70353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 30] Process group watchdog thread terminated with exception: [Rank 30] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f99760e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f99773bfc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f99773c4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f99773c5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f99c2e5ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f99c7ea5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f99c7c70353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f99760e6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f9977049119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f99c2e5ee95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f99c7ea5609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f99c7c70353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 31] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:577] [Rank 31] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:583] [Rank 31] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 31] Process group watchdog thread terminated with exception: [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8dec03897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8dfedcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8dfee1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8dfee2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fa92b97be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fa9309c2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fa93078d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 31] Process group watchdog thread terminated with exception: [Rank 31] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8dec03897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa8dfedcc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa8dfee1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa8dfee2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7fa92b97be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7fa9309c2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7fa93078d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa8dec03897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7fa8dfb66119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7fa92b97be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7fa9309c2609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7fa93078d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 26] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:577] [Rank 26] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:583] [Rank 26] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 26] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa94d656897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa94e92fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa94e934a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa94e935dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa99a3cee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa99f415609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa99f1e0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 26] Process group watchdog thread terminated with exception: [Rank 26] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600098 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa94d656897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fa94e92fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fa94e934a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fa94e935dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa99a3cee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa99f415609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa99f1e0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fa94d656897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7fa94e5b9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fa99a3cee95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fa99f415609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 25] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:577] [Rank 25] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:frame #4: clone + 0x43 (0x7fa99f1e0353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:583] [Rank 25] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 25] Process group watchdog thread terminated with exception: [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1657b8d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1658e66c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1658e6ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1658e6cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f16a4905e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]: -[default1]:frame #5: + 0x8609 (0x7f16a994c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f16a9717353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 28] Process group watchdog thread terminated with exception: [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8d62614897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8d638edc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8d638f2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8d638f3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f8daf38ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]: what(): [PG 2 Rank 25] Process group watchdog thread terminated with exception: [Rank 25] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #5: + 0x8609 (0x7f8db43d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f8db419e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1657b8d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1658e66c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1658e6ba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1658e6cdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f16a4905e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]: what(): [PG 2 Rank 28] Process group watchdog thread terminated with exception: [Rank 28] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600085 milliseconds before timing out. 
-[default1]:frame #5: + 0x8609 (0x7f16a994c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f16a9717353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8d62614897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f8d638edc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f8d638f2a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f8d638f3dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]: -[default4]:frame #4: + 0xd3e95 (0x7f8daf38ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f8db43d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f8db419e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1657b8d897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f1658af0119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f16a4905e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f16a994c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #4: clone + 0x43 (0x7f16a9717353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f8d62614897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f8d63577119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f8daf38ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f8db43d3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f8db419e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -E0703 01:31:23.786000 140125971289920 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 753288) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 17 (local_rank: 1) - exitcode : -6 (pid: 753289) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753289 -[2]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 18 (local_rank: 2) - exitcode : -6 (pid: 753290) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753290 -[3]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 19 (local_rank: 3) - exitcode : -6 (pid: 753291) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753291 -[4]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 20 (local_rank: 4) - exitcode : -6 (pid: 753292) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753292 -[5]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 21 (local_rank: 5) - exitcode : -6 (pid: 753293) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753293 -[6]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 22 (local_rank: 6) - exitcode : -6 (pid: 753294) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753294 -[7]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 23 (local_rank: 7) - exitcode : -6 (pid: 753295) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753295 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:31:23 - host : ip-26-0-163-220.ec2.internal - rank : 16 (local_rank: 0) - exitcode : -6 (pid: 753288) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 753288 -============================================================ -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 13] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 13] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 13] To avoid data inconsistency, we are taking the entire process down. 
-[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1edf354897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1ee062dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1ee0632a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1ee0633dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1f2c0cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1f31113609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1f30ede353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600095 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1edf354897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1ee062dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1ee0632a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1ee0633dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f1f2c0cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f1f31113609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f1f30ede353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1edf354897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f1ee02b7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f1f2c0cce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f1f31113609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f1f30ede353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 9] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 9] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 9] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 9] Process group watchdog thread terminated with exception: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7effddaa6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7effded7fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7effded84a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7effded85dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f002a81ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f002f865609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f002f630353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 9] Process group watchdog thread terminated with exception: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7effddaa6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7effded7fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7effded84a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7effded85dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f002a81ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f002f865609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f002f630353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7effddaa6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7effdea09119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f002a81ee95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f002f865609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f002f630353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 8] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 15] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 8] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 15] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 8] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 15] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 8] Process group watchdog thread terminated with exception: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 15] Process group watchdog thread terminated with exception: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9174ef2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc881579897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91761cbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc882852c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91761d0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91761d1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f91c1c6ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f91c6cb1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc882857a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #6: clone + 0x43 (0x7f91c6a7c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc882858dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fc8ce2f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]:frame #5: + 0x8609 (0x7fc8d3338609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]: what(): [PG 2 Rank 15] Process group watchdog thread terminated with exception: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600073 milliseconds before timing out. 
-[default0]:frame #6: clone + 0x43 (0x7fc8d3103353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9174ef2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f91761cbc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]: what(): [PG 2 Rank 8] Process group watchdog thread terminated with exception: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f91761d0a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc881579897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc882852c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f91761d1dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc882857a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc882858dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f91c1c6ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #4: + 0xd3e95 (0x7fc8ce2f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fc8d3338609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fc8d3103353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #5: + 0x8609 (0x7f91c6cb1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc881579897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #6: clone + 0x43 (0x7f91c6a7c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #1: + 0xe32119 (0x7fc8824dc119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]: 
-[default0]:frame #2: + 0xd3e95 (0x7fc8ce2f1e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #3: + 0x8609 (0x7fc8d3338609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9174ef2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #4: clone + 0x43 (0x7fc8d3103353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default7]:frame #1: + 0xe32119 (0x7f9175e55119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f91c1c6ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f91c6cb1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f91c6a7c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 12] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 12] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 12] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 12] Process group watchdog thread terminated with exception: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f656d740897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f656ea19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f656ea1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f656ea1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f65ba4b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f65bf4ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f65bf2ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 2 Rank 12] Process group watchdog thread terminated with exception: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f656d740897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f656ea19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f656ea1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f656ea1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7f65ba4b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7f65bf4ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7f65bf2ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f656d740897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7f656e6a3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7f65ba4b8e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7f65bf4ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7f65bf2ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 11] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 11] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 11] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f3afe9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1f3c2c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1f3c2c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1f3c2c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1f87d61e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1f8cda8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1f8cb73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600080 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f3afe9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1f3c2c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1f3c2c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1f3c2c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f1f87d61e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f1f8cda8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f1f8cb73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1f3afe9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f1f3bf4c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f1f87d61e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f1f8cda8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f1f8cb73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 14] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 14] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 14] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9ec9b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd9edc8ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd9edc93a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd9edc94dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fda3972de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fda3e774609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fda3e53f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600075 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9ec9b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd9edc8ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd9edc93a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd9edc94dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fda3972de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fda3e774609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fda3e53f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd9ec9b5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fd9ed918119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fda3972de95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fda3e774609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fda3e53f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 10] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 10] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 10] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 10] Process group watchdog thread terminated with exception: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9fb43a7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9fb5680c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9fb5685a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9fb5686dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa00111fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa006166609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa005f31353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 10] Process group watchdog thread terminated with exception: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9fb43a7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9fb5680c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9fb5685a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9fb5686dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7fa00111fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7fa006166609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7fa005f31353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9fb43a7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f9fb530a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7fa00111fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7fa006166609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7fa005f31353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -E0703 01:31:33.787000 139864455763776 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 3199391) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED 
------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 25 (local_rank: 1) - exitcode : -6 (pid: 3199392) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199392 -[2]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 26 (local_rank: 2) - exitcode : -6 (pid: 3199393) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199393 -[3]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 27 (local_rank: 3) - exitcode : -6 (pid: 3199394) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199394 -[4]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 28 (local_rank: 4) - exitcode : -6 (pid: 3199395) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199395 -[5]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 29 (local_rank: 5) - exitcode : -6 (pid: 3199396) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199396 -[6]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 30 (local_rank: 6) - exitcode : -6 (pid: 3199397) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199397 -[7]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 31 (local_rank: 7) - exitcode : -6 (pid: 3199398) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199398 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:31:33 - host : ip-26-0-163-226.ec2.internal - rank : 24 (local_rank: 0) - exitcode : -6 (pid: 3199391) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 3199391 -============================================================ -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -E0703 01:31:38.725000 140006959662912 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 503202) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 9 (local_rank: 1) - 
exitcode : -6 (pid: 503203) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503203 -[2]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 503204) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503204 -[3]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 11 (local_rank: 3) - exitcode : -6 (pid: 503205) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503205 -[4]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 12 (local_rank: 4) - exitcode : -6 (pid: 503206) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503206 -[5]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 13 (local_rank: 5) - exitcode : -6 (pid: 503207) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503207 -[6]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 14 (local_rank: 6) - exitcode : -6 (pid: 503208) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503208 -[7]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 15 (local_rank: 7) - exitcode : -6 (pid: 503209) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503209 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:31:38 - host : ip-26-0-161-178.ec2.internal - rank : 8 (local_rank: 0) - exitcode : -6 (pid: 503202) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 503202 -============================================================ -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 5] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 0] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 5] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 5] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f09eec70897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f09eff49c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f09eff4ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f09eff4fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f0a3b9e8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f0a40a2f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f0a407fa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 2 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. 
-[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f92252cd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f09eec70897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f92265a6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f09eff49c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f92265aba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f92265acdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f09eff4ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9272045e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f09eff4fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f0a3b9e8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f927708c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f9276e57353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #5: + 0x8609 (0x7f0a40a2f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600093 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f92252cd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f92265a6c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f92265aba80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f92265acdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f9272045e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #6: clone + 0x43 (0x7f0a407fa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default5]:frame #5: + 0x8609 (0x7f927708c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f09eec70897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f09efbd3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f0a3b9e8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #6: clone + 0x43 (0x7f9276e57353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #3: + 0x8609 (0x7f0a40a2f609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f0a407fa353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]: -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f92252cd897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f9226230119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f9272045e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f927708c609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f9276e57353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 4] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 4] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 4] To avoid data inconsistency, we are taking the entire process down. 
-[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 4] Process group watchdog thread terminated with exception: [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe9c7f61897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe9c923ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe9c923fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe9c9240dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fea14cd9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fea19d20609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fea19aeb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 2 Rank 4] Process group watchdog thread terminated with exception: [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600055 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe9c7f61897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe9c923ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe9c923fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe9c9240dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fea14cd9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fea19d20609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fea19aeb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe9c7f61897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fe9c8ec4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fea14cd9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fea19d20609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fea19aeb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 6] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 6] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 6] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe800fe9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe8022c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe8022c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe8022c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe84dd61e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe852da8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe852b73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe800fe9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe8022c2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe8022c7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe8022c8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fe84dd61e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fe852da8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fe852b73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe800fe9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fe801f4c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fe84dd61e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fe852da8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fe852b73353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 3] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb09b18897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffb0adf1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffb0adf6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffb0adf7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ffb56890e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ffb5b8d7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ffb5b6a2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb09b18897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7ffb0adf1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7ffb0adf6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7ffb0adf7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7ffb56890e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7ffb5b8d7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7ffb5b6a2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7ffb09b18897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7ffb0aa7b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7ffb56890e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7ffb5b8d7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7ffb5b6a2353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 1] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9e6afda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9e6c2b3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9e6c2b8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9e6c2b9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9eb7d52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9ebcd99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9ebcb64353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600081 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9e6afda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f9e6c2b3c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f9e6c2b8a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f9e6c2b9dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f9eb7d52e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f9ebcd99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f9ebcb64353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f9e6afda897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f9e6bf3d119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f9eb7d52e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f9ebcd99609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f9ebcb64353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 7] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 7] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 7] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b627f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4b63acec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4b63ad3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4b63ad4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4baf56de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4bb45b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4bb437f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b627f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f4b63acec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f4b63ad3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f4b63ad4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f4baf56de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f4bb45b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4bb437f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f4b627f5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f4b63758119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f4baf56de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f4bb45b4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4bb437f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 2] Timeout at NCCL work: 152896, last enqueued NCCL work: 153012, last completed NCCL work: 152895. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f16629eb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1663cc4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1663cc9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1663ccadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f16af763e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f16b47aa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f16b4575353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=33554432, NumelOut=1048576, Timeout(ms)=600000) ran for 600082 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f16629eb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1663cc4c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1663cc9a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1663ccadcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f16af763e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f16b47aa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f16b4575353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f16629eb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f166394e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f16af763e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f16b47aa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f16b4575353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -E0703 01:41:59.381000 139740769310528 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 1115867) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 1115868) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115868 -[2]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 1115869) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115869 -[3]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 1115870) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115870 -[4]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 1115871) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115871 -[5]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 1115872) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115872 -[6]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 1115873) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115873 -[7]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 1115874) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115874 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:41:59 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 1115867) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 1115867 
-============================================================ -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -W0703 01:42:03.225000 140536230520576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_882229_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:03.424000 140005097477888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1812068_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:03.462000 139644870969088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1039535_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:03.735000 139619821475584 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1839790_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:04.208000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882304 closing signal SIGTERM -W0703 01:42:04.208000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882305 closing signal SIGTERM -W0703 01:42:04.208000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882306 closing signal SIGTERM -W0703 01:42:04.210000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882307 closing signal SIGTERM -W0703 01:42:04.210000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882308 closing signal SIGTERM -W0703 01:42:04.210000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882309 closing signal SIGTERM -W0703 01:42:04.213000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882310 closing signal SIGTERM -W0703 01:42:04.213000 140541891254080 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 882311 closing signal SIGTERM -W0703 01:42:04.216000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039610 closing signal SIGTERM -W0703 01:42:04.216000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1839865 closing signal SIGTERM -W0703 01:42:04.217000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039611 closing signal SIGTERM -W0703 01:42:04.217000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1839866 closing signal SIGTERM -W0703 01:42:04.217000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1839867 closing signal SIGTERM -W0703 01:42:04.217000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039612 closing signal SIGTERM -W0703 01:42:04.217000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039613 closing signal SIGTERM -W0703 01:42:04.218000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039614 closing signal SIGTERM -W0703 01:42:04.217000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 
1839868 closing signal SIGTERM -W0703 01:42:04.218000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1839869 closing signal SIGTERM -W0703 01:42:04.218000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039615 closing signal SIGTERM -W0703 01:42:04.219000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039616 closing signal SIGTERM -W0703 01:42:04.219000 139650531702592 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1039617 closing signal SIGTERM -W0703 01:42:04.220000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1839870 closing signal SIGTERM -W0703 01:42:04.220000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1839871 closing signal SIGTERM -W0703 01:42:04.219000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812143 closing signal SIGTERM -W0703 01:42:04.220000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812144 closing signal SIGTERM -W0703 01:42:04.220000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812145 closing signal SIGTERM -W0703 01:42:04.222000 139625482209088 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1839872 closing signal SIGTERM -W0703 01:42:04.222000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812146 closing signal SIGTERM -W0703 01:42:04.222000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812147 closing signal SIGTERM -W0703 01:42:04.222000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812148 closing signal SIGTERM -W0703 01:42:04.223000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812149 closing signal SIGTERM -W0703 01:42:04.225000 140010758211392 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1812150 closing signal SIGTERM -W0703 01:42:08.230000 140536230520576 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_882229_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:08.429000 140005097477888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1812068_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:08.470000 139644870969088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1039535_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:08.743000 139619821475584 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1839790_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:11.417000 140541891254080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_882229_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 01:42:11.434000 140541891254080 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_882229_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. 
-srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 01:42:13.433000 140005097477888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1812068_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:13.474000 139644870969088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1039535_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:13.747000 139619821475584 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1839790_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:16.341000 139625482209088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1839790_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:16.362000 139625482209088 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1839790_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in 
num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -W0703 01:42:17.251000 139650531702592 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1039535_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:17.268000 139650531702592 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1039535_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 01:42:17.687000 140010758211392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1812068_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:42:17.703000 140010758211392 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1812068_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-192_1115867.1719969970537153559.pt.trace.json: 0%| | 0.00/18.0G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/config.yaml deleted file mode 100644 index c5a674583e2680fad06ac49f0ad697bdfc745942..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 512 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/log.out deleted file mode 100644 index d7d49e38b5742fe37a9beacb51fe281ece04e643..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/log.out +++ /dev/null @@ -1,3450 +0,0 @@ -======================== -START TIME: Wed Jul 3 09:16:03 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 09:16:06.493000 139994058020672 torch/distributed/run.py:757] -W0703 09:16:06.493000 139994058020672 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.493000 139994058020672 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:16:06.493000 139994058020672 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.495000 140074655254336 torch/distributed/run.py:757] -W0703 09:16:06.495000 140074655254336 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.495000 140074655254336 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:16:06.495000 140074655254336 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.492000 140379220019008 torch/distributed/run.py:757] -W0703 09:16:06.492000 140379220019008 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.492000 140379220019008 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:16:06.492000 140379220019008 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.494000 139885305988928 torch/distributed/run.py:757] -W0703 09:16:06.494000 139885305988928 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.494000 139885305988928 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:16:06.494000 139885305988928 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.511000 139921262425920 torch/distributed/run.py:757] -W0703 09:16:06.511000 139921262425920 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.511000 139921262425920 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 09:16:06.511000 139921262425920 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.515000 140226751543104 torch/distributed/run.py:757] -W0703 09:16:06.515000 140226751543104 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.515000 140226751543104 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:16:06.515000 140226751543104 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.540000 139931029919552 torch/distributed/run.py:757] -W0703 09:16:06.540000 139931029919552 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.540000 139931029919552 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:16:06.540000 139931029919552 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.621000 140475111126848 torch/distributed/run.py:757] -W0703 09:16:06.621000 140475111126848 torch/distributed/run.py:757] ***************************************** -W0703 09:16:06.621000 140475111126848 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 09:16:06.621000 140475111126848 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 09:16:26 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=32, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:16:26 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=512, -[default0]:07/03/2024 09:16:26 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512')), -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 09:16:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-147]: No checkpoint path provided. -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-147]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-147]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. 
-[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=19|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=18|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. 
-[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=17|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=21|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=23|ip-26-0-161-138]: No checkpoint path provided. -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=22|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=20|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=16|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 09:16:43 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=16|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=29|ip-26-0-167-177]: No checkpoint path provided. -[default4]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=28|ip-26-0-167-177]: No checkpoint path provided. -[default0]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=24|ip-26-0-167-177]: No checkpoint path provided. -[default7]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=31|ip-26-0-167-177]: No checkpoint path provided. -[default3]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=26|ip-26-0-167-177]: No checkpoint path provided. -[default3]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=19|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=20|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=25|ip-26-0-167-177]: No checkpoint path provided. -[default1]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=17|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=18|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=21|ip-26-0-166-125]: No checkpoint path provided. -[default6]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=22|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=23|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=27|ip-26-0-167-177]: No checkpoint path provided. -[default6]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=30|ip-26-0-167-177]: No checkpoint path provided. -[default0]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=0|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=3|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=2|ip-26-0-164-207]: No checkpoint path provided. -[default1]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=1|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=6|ip-26-0-164-207]: No checkpoint path provided. -[default4]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=4|ip-26-0-164-207]: No checkpoint path provided. 
-[default7]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=7|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 09:16:44 [INFO|DP=1|PP=0|TP=5|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 09:16:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 09:16:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 09:16:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 09:16:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 09:16:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 09:16:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 09:16:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 09:16:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 09:16:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 09:16:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 09:16:47.779102 | mbs: 512 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 09:16:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 09:16:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=26|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=24|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=21|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. 
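As an editorial sanity check on the numbers logged above (not output from the run): with dp=2, batch_accumulation_per_replica=1 and micro_batch_size=512, the global batch is 2 x 1 x 512 = 1024 sequences of length 4096, i.e. roughly 4.19M tokens per step, matching the global_batch_size: 1024 reported at [Start training]. Likewise, the 1.11G total parameters (2122.50MiB in bf16) sharded over tp=32 give the 34.8M (66.33MiB) reported per rank, and ZeRO stage 1 over dp=2 splits the optimizer states 50/50, i.e. the "17.4M out of 34.8M (50.00%)" shown in the ZeRO sharding lines. A minimal shell sketch of the same arithmetic (variable names are illustrative):

# Editorial check only; inputs are taken from the config and log above.
dp=2; grad_accum=1; mbs=512; seq_len=4096; tp=32; total_params_mib=2122.50
echo "global batch (sequences): $(( dp * grad_accum * mbs ))"            # 1024
echo "tokens per step:          $(( dp * grad_accum * mbs * seq_len ))"  # 4194304
# bash integer arithmetic cannot handle 2122.50, so use awk for the per-rank figure.
awk -v t="$total_params_mib" -v tp="$tp" 'BEGIN { printf "params per TP rank: %.2f MiB\n", t / tp }'  # 66.33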
-[default6]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=31|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=27|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=29|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=28|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=30|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=16|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=25|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=31|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=28|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=29|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=24|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. 
Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=19|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=25|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=17|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=19|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=20|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=18|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=18|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:48 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=17|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=21|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. 
Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=23|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=22|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=23|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=16|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=0|PP=0|TP=20|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=30|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=27|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=0|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=1|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=3|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=2|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=4|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=6|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=5|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 09:16:47 [WARNING|DP=1|PP=0|TP=7|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 09:16:48 [WARNING|DP=1|PP=0|TP=26|ip-26-0-167-177]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 09:16:48 [WARNING|DP=1|PP=0|TP=22|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
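From this point the log interleaves near-identical Python tracebacks from many ranks, which is hard to follow line by line. One way to inspect a single failure (an editorial suggestion, not part of the harness) is to filter the log down to one rank first, for example:

# Show only rank 24's lines from the interleaved log.
grep -F '[rank24]:' /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/log.out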
-[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default5]:[rank29]: Traceback (most recent call last): -[default1]:[rank25]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: trainer.train(dataloader) -[default5]:[rank29]: trainer.train(dataloader) -[default4]:[rank28]: Traceback (most recent call last): -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: trainer.train(dataloader) -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: outputs = 
-[... This portion of the deleted log interleaves the same Python traceback, line by line, from every rank that appears in it (ranks 8-13, 15-16, 19 and 24-31); only the reported free GPU memory differs per rank (7.54-7.90 GiB). A representative copy, reconstructed from rank 9, follows; the repeated torch/nn/modules/module.py wrapper frames (_wrapped_call_impl, line 1532 / _call_impl, line 1541) between the model frames are elided. ...]
-[default1]:[rank9]: Traceback (most recent call last):
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank9]:     trainer.train(dataloader)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank9]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank9]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank9]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank9]:     output = model(**micro_batch)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank9]:     sharded_logits = self.model(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank9]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank9]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default1]:[rank9]:     output = self.pp_block(**new_kwargs)
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default1]:[rank9]:     hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default1]:[rank9]:     hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default1]:[rank9]:     return row_linear(
-[default1]:[rank9]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank9]:     out = F.linear(input, weight, bias)
-[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank8]: output = self.pp_block(**new_kwargs) -30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank30]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.72 GiB is free. Including non-PyTorch memory, this process has 71.60 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default3]:[rank19]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default5]:[rank13]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank12]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank8]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward 
-[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default7]:[rank15]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank15]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank12]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank8]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return row_linear( -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return row_linear( -[default0]:[rank8]: return self._call_impl(*args, **kwargs) -[default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank8]: return forward_call(*args, **kwargs) -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU -[default4]:[rank12]: return forward_call(*args, **kwargs) -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default5]:[rank13]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: out = F.linear(input, weight, bias) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank12]: return row_linear( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: sharded_logits = self.model( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/benc[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. 
GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -h_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call[default0]:[rank8]: return row_linear( -[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank17]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank17]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, 
**kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank23]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank23]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. 
Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank8]: out = F.linear(input, weight, bias) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU -[default5]:[rank13]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank12]: out = F.linear(input, weight, bias) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank13]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank13]: return self._call_impl(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank13]: return forward_call(*args, **kwargs) -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank13]: return row_linear( -[default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank13]: out = F.linear(input, weight, bias) -[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
-[log note: rank 14 fails with the identical traceback and torch.cuda.OutOfMemoryError (8.00 GiB requested, 7.54 GiB free, 71.77 GiB in use, 61.64 GiB allocated by PyTorch). Ranks 20, 21 and 22 then begin the same traceback, interleaved with one another, continuing below.]
-[default6]:[rank22]: hidden_states =
self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank20]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank20]: out = F.linear(input, weight, bias) -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank21]: return row_linear( -[default6]:[rank22]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default2]:[rank18]: trainer.train(dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. 
Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank22]: return row_linear( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: out = F.linear(input, weight, bias) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank18]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank18]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: output = model(**micro_batch) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward 
-[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank41]: output = self.pp_block(**new_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank41]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank41]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank41]: return row_linear( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank41]: out = F.linear(input, weight, bias) -[default1]:[rank41]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. 
Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank55]: output = self.pp_block(**new_kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank55]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank55]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank55]: return row_linear( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank55]: out = F.linear(input, weight, bias) -[default7]:[rank55]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank51]: Traceback (most recent call last): -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: Traceback (most recent call last): -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: trainer.train(dataloader) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default3]:[rank51]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank49]: trainer.train(dataloader) -[default6]:[rank54]: Traceback (most recent call last): -[default4]:[rank44]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: trainer.train(dataloader) -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: trainer.train(dataloader) -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank48]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter 
-[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank46]: output = self.pp_block(**new_kwargs) -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank48]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: output = model(**micro_batch) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank49]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default6]:[rank46]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = model(**micro_batch) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default6]:[rank46]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank44]: output = self.pp_block(**new_kwargs) -[default6]:[rank46]: return 
self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default1]:[rank49]: output = model(**micro_batch) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank46]: return row_linear( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default6]:[rank46]: out = F.linear(input, weight, bias) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank46]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank51]: sharded_logits = self.model( -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank44]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank50]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank44]: return row_linear( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank44]: out = F.linear(input, weight, bias) -[default4]:[rank44]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default7]:[rank47]: Traceback (most recent call last):
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank47]: trainer.train(dataloader)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank47]: output = model(**micro_batch)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank47]: return self._call_impl(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank47]: return forward_call(*args, **kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank47]: sharded_logits = self.model(
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default7]:[rank47]: output = self.pp_block(**new_kwargs)
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default7]:[rank47]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default7]:[rank47]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default7]:[rank47]: return row_linear(
-[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default7]:[rank47]: out = F.linear(input, weight, bias)
-[default7]:[rank47]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[log reconstruction: ranks 33, 35, 36, 40, 42, 43, 45, 48, 49, 50, 51, 52, 53 and 54 emit the same traceback as rank 47 above, interleaved and split mid-line by concurrent writes in the original log; the per-rank OutOfMemoryError lines recoverable from this span are kept below, and the remaining partial tracebacks continue after it]
-[default6]:[rank54]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank48]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU
-[default1]:[rank49]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank50]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank51]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank40]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU
-[default2]:[rank42]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank36]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.90 GiB is free. Including non-PyTorch memory, this process has 71.41 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank45]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank43]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default1]:[rank33]: File
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: return row_linear( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: output = model(**micro_batch) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank35]: out = F.linear(input, weight, bias) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.72 GiB is free. Including non-PyTorch memory, this process has 71.60 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank33]: return row_linear( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank33]: out = F.linear(input, weight, bias) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.72 GiB is free. Including non-PyTorch memory, this process has 71.60 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default6]:[rank38]: Traceback (most recent call last): -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank38]: trainer.train(dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank38]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank38]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank38]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank38]: output = model(**micro_batch) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank38]: sharded_logits = self.model( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank38]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/benc[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -h_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank38]: hidden_encoder_states = 
encoder_block(**hidden_encoder_states) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank38]: output = self.pp_block(**new_kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank38]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank38]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank38]: return row_linear( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank38]: out = F.linear(input, weight, bias) -[default6]:[rank38]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.90 GiB is free. Including non-PyTorch memory, this process has 71.41 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: Traceback (most recent call last): -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank34]: trainer.train(dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: output = model(**micro_batch) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank34]: sharded_logits = self.model( -[default4]:[rank52]: output = self.pp_block(**new_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) 
-[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank34]: output = self.pp_block(**new_kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank34]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank34]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank34]: return row_linear( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank34]: out = F.linear(input, weight, bias) -[default2]:[rank34]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.90 GiB is free. Including non-PyTorch memory, this pr[default4]:[rank52]: return forward_call(*args, **kwargs) -ocess has 71.41 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank52]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank53]: output = self.pp_block(**new_kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) 
-[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank53]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank52]: return row_linear( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank53]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: out = F.linear(input, weight, bias) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.65 GiB is free. Including non-PyTorch memory, this process has 71.67 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank53]: return row_linear( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank53]: out = F.linear(input, weight, bias) -[default5]:[rank53]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.54 GiB is free. Including non-PyTorch memory, this process has 71.77 GiB memory in use. 
Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank32]: output = self.pp_block(**new_kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank32]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank32]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank32]: return row_linear( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank32]: out = F.linear(input, weight, bias) -[default0]:[rank32]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. 
GPU -[default5]:[rank37]: Traceback (most recent call last): -[default7]:[rank39]: Traceback (most recent call last): -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank39]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank39]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank39]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank39]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: output = model(**micro_batch) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] 
-[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: output = model(**micro_batch) -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank39]: output = self.pp_block(**new_kwargs) -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank39]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank37]: output = self.pp_block(**new_kwargs) -[default7]:[rank39]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank37]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank39]: return row_linear( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank39]: out = F.linear(input, weight, bias) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.72 GiB is free. Including non-PyTorch memory, this process has 71.60 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank37]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank37]: return row_linear( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank37]: out = F.linear(input, weight, bias) -[default5]:[rank37]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 7.72 GiB is free. Including non-PyTorch memory, this process has 71.60 GiB memory in use. Of the allocated memory 61.64 GiB is allocated by PyTorch, and 229.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
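The allocator hint repeated in the messages above comes from PyTorch itself: when the caching allocator is fragmented, expandable segments can sometimes let an allocation of this size go through instead of raising OutOfMemoryError. A minimal sketch of applying that setting, assuming it is done before the first CUDA allocation in the process; the entrypoint below is only illustrative, not the benchmark's run_train.py:

import os

# PYTORCH_CUDA_ALLOC_CONF is the variable named in the OOM messages above.
# It has to be in place before the CUDA caching allocator initialises, i.e.
# before the first tensor lands on a GPU; exporting it in the job's shell
# environment before launching torchrun achieves the same thing.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # imported after the variable is set


def main() -> None:
    # Hypothetical stand-in for the real training entrypoint.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    x = torch.empty(1024, 1024, device=device)
    print(x.device, os.environ["PYTORCH_CUDA_ALLOC_CONF"])


if __name__ == "__main__":
    main()

Whether this avoids the particular 8.00 GiB failure above is not guaranteed; it only changes how the allocator reuses reserved-but-unallocated memory.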
-W0703 09:17:12.829000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 944194 closing signal SIGTERM
-W0703 09:17:12.830000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 944195 closing signal SIGTERM
-W0703 09:17:12.830000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 944196 closing signal SIGTERM
-W0703 09:17:12.830000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 944197 closing signal SIGTERM
-W0703 09:17:12.830000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 944198 closing signal SIGTERM
-W0703 09:17:12.830000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 944199 closing signal SIGTERM
-W0703 09:17:12.830000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 944200 closing signal SIGTERM
-W0703 09:17:12.837000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202898 closing signal SIGTERM
-W0703 09:17:12.837000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202899 closing signal SIGTERM
-W0703 09:17:12.837000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202901 closing signal SIGTERM
-W0703 09:17:12.837000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202902 closing signal SIGTERM
-W0703 09:17:12.837000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202903 closing signal SIGTERM
-W0703 09:17:12.837000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202904 closing signal SIGTERM
-W0703 09:17:12.838000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 202905 closing signal SIGTERM
-W0703 09:17:12.838000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 741264 closing signal SIGTERM
-W0703 09:17:12.839000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 741265 closing signal SIGTERM
-W0703 09:17:12.839000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 741266 closing signal SIGTERM
-W0703 09:17:12.839000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 741267 closing signal SIGTERM
-W0703 09:17:12.839000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 741269 closing signal SIGTERM
-W0703 09:17:12.839000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 741270 closing signal SIGTERM
-W0703 09:17:12.839000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 741271 closing signal SIGTERM
-W0703 09:17:12.845000 140475111126848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 469427 closing signal SIGTERM
-W0703 09:17:12.845000 140475111126848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 469428 closing signal SIGTERM
-W0703 09:17:12.845000 140475111126848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 469429 closing signal SIGTERM
-W0703 09:17:12.846000 140475111126848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 469431 closing signal SIGTERM
-W0703 09:17:12.846000 140475111126848 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 469433 closing signal SIGTERM
-E0703 09:17:12.953000 139994058020672 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 856881) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
-    sys.exit(main())
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
-    return f(*args, **kwargs)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
-    run(args)
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
-    elastic_launch(
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]-[7]:
-  time : 2024-07-03_09:17:12
-  host : ip-26-0-163-147.ec2.internal
-  rank : 25-31 (local_rank: 1-7)
-  exitcode : 1 (pid: 856882-856888)
-  error_file: 
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time : 2024-07-03_09:17:12
-  host : ip-26-0-163-147.ec2.internal
-  rank : 24 (local_rank: 0)
-  exitcode : 1 (pid: 856881)
-  error_file: 
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
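The failure tables above and below all report an empty error_file and point at https://pytorch.org/docs/stable/elastic/errors.html. What that page describes is the record decorator from torch.distributed.elastic.multiprocessing.errors: wrapping the launched entrypoint with it makes each child's traceback land in the per-rank error file that torchrun then prints in these summaries. A minimal sketch, assuming a plain main() entrypoint; whether nanotron's run_train.py already does this is not visible from this log:

from torch.distributed.elastic.multiprocessing.errors import record


@record  # propagates an uncaught exception into torchrun's per-rank error file
def main() -> None:
    # Hypothetical stand-in for the launched training entrypoint.
    raise RuntimeError("demonstrate error-file propagation")


if __name__ == "__main__":
    main()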
exitcode : 1 (pid: 856881) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-163-147: task 3: Exited with exit code 1 -E0703 09:17:14.164000 140475111126848 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 469426) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:17:12 - host : ip-26-0-164-207.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 469430) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:17:12 - host : ip-26-0-164-207.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 (pid: 469432) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:17:12 - host : ip-26-0-164-207.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 469426) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:17:14.660000 139885305988928 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 2 (pid: 202900) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -srun: error: ip-26-0-164-207: task 4: Exited with exit code 1 -E0703 09:17:14.660000 140379220019008 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 4 (pid: 741268) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:17:12 - host : ip-26-0-166-125.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 202900) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:17:12 - host : ip-26-0-161-138.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 741268) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 09:17:14.757000 139931029919552 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 7 (pid: 944201) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:17:12 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 944201) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-138: task 2: Exited with exit code 1 -srun: error: ip-26-0-166-125: task 6: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 1: Exited with exit code 1 -E0703 09:17:17.968000 140226751543104 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 959058) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:17:17 - host : ip-26-0-165-24.ec2.internal - rank : 41 (local_rank: 1) - exitcode : 1 (pid: 959059) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_09:17:17 - host : 
ip-26-0-165-24.ec2.internal - rank : 42 (local_rank: 2) - exitcode : 1 (pid: 959060) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_09:17:17 - host : ip-26-0-165-24.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 959061) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_09:17:17 - host : ip-26-0-165-24.ec2.internal - rank : 44 (local_rank: 4) - exitcode : 1 (pid: 959062) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_09:17:17 - host : ip-26-0-165-24.ec2.internal - rank : 45 (local_rank: 5) - exitcode : 1 (pid: 959063) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_09:17:17 - host : ip-26-0-165-24.ec2.internal - rank : 46 (local_rank: 6) - exitcode : 1 (pid: 959064) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_09:17:17 - host : ip-26-0-165-24.ec2.internal - rank : 47 (local_rank: 7) - exitcode : 1 (pid: 959065) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:17:17 - host : ip-26-0-165-24.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 959058) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-165-24: task 5: Exited with exit code 1 -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 27] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 27] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank59]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 27] [PG 2 Rank 27] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 28] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 28] ProcessGroupNCCL preparing to dump debug info. -[default4]:[rank60]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 28] [PG 2 Rank 28] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 5] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 5] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank5]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 5] [PG 2 Rank 5] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 7] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 7] ProcessGroupNCCL preparing to dump debug info. -[default7]:[rank7]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 7] [PG 2 Rank 7] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 2] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=5 -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 2] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank2]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 2] [PG 2 Rank 2] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 5 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 6] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=7 -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 6] ProcessGroupNCCL preparing to dump debug info. -[default6]:[rank6]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 6] [PG 2 Rank 6] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 7 -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 3] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 3] ProcessGroupNCCL preparing to dump debug info. -[default3]:[rank3]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 3] [PG 2 Rank 3] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 1] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 1] ProcessGroupNCCL preparing to dump debug info. -[default1]:[rank1]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 1] [PG 2 Rank 1] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 25] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 25] ProcessGroupNCCL preparing to dump debug info. -[default1]:[rank57]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 25] [PG 2 Rank 25] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 4] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=3 -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 4] ProcessGroupNCCL preparing to dump debug info. -[default4]:[rank4]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 4] [PG 2 Rank 4] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 3 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 0] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 0] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank0]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 0] [PG 2 Rank 0] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 26] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=7 -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 26] ProcessGroupNCCL preparing to dump debug info. -[default2]:[rank58]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 26] [PG 2 Rank 26] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 7 -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 24] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 24] ProcessGroupNCCL preparing to dump debug info. -[default0]:[rank56]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 24] [PG 2 Rank 24] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 31] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 31] ProcessGroupNCCL preparing to dump debug info. -[default7]:[rank63]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 31] [PG 2 Rank 31] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. 
This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 29] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 29] ProcessGroupNCCL preparing to dump debug info. -[default5]:[rank61]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 29] [PG 2 Rank 29] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 4 -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1316] [PG 2 Rank 30] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=4 -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:1153] [PG 2 Rank 30] ProcessGroupNCCL preparing to dump debug info. -[default6]:[rank62]:[F ProcessGroupNCCL.cpp:1169] [PG 2 Rank 30] [PG 2 Rank 30] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. 
workMetaList_.size() = 4 -E0703 09:36:34.198000 140074655254336 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 73623) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 09:36:34.239000 139921262425920 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 839551) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : -6 (pid: 73624) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73624 -[2]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : -6 (pid: 73625) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73625 -[3]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : -6 (pid: 73626) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73626 -[4]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : -6 (pid: 73627) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73627 -[5]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : -6 (pid: 73628) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73628 -[6]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : -6 (pid: 73629) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73629 -[7]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : -6 (pid: 73630) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73630 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:36:34 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : -6 (pid: 73623) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 73623 -============================================================ -Traceback (most recent call last): 
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 57 (local_rank: 1) - exitcode : -6 (pid: 839552) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839552 -[2]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 58 (local_rank: 2) - exitcode : -6 (pid: 839553) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839553 -[3]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 59 (local_rank: 3) - exitcode : -6 (pid: 839554) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839554 -[4]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 60 (local_rank: 4) - exitcode : -6 (pid: 839555) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839555 -[5]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 61 (local_rank: 5) - exitcode : -6 (pid: 839556) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839556 -[6]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 62 (local_rank: 6) - exitcode : -6 (pid: 839557) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839557 -[7]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 63 (local_rank: 7) - exitcode : -6 (pid: 839558) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839558 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_09:36:34 - host : ip-26-0-167-177.ec2.internal - rank : 56 (local_rank: 0) - exitcode : -6 (pid: 839551) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 839551 -============================================================ -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -srun: error: ip-26-0-167-177: task 7: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt
deleted file mode 100644
index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-512/status.txt
+++ /dev/null
@@ -1 +0,0 @@
-oom
\ No newline at end of file
diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/bench.slurm b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/bench.slurm
deleted file mode 100644
index d1ac12aea60c4fbbe72e48c57aa17173b3d69115..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/bench.slurm
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-
-#SBATCH --job-name=bench_cluster
-#SBATCH --time=01:30:00
-#SBATCH --partition=hopper-prod
-#SBATCH --nodes=8
-#SBATCH --gres=gpu:8
-#SBATCH --qos=high
-#SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=96
-#SBATCH --exclusive
-#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out
-#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out
-
-# Function to update status based on squeue output
-update_status() {
- job_id=$1
- status_file=$2
- # For unknown reasons, it doesn't update status for pending. It only works for running
- while true; do
- job_status=$(squeue --job $job_id --noheader --format=%T)
- echo "Job status: $job_status"
- if [ -z "$job_status" ]; then
- # Job has finished or is not found
- break
- elif [ "$job_status" = "RUNNING" ]; then
- printf "running" > $status_file
- break
- fi
- sleep 10
- done
-}
-
-# Misc initializations.
-echo "========================"
-echo "START TIME: $(date)"
-source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
-conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
-echo python3 version = $(python3 --version)
-echo "========================"
-
-# Slurm stuff
-export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
-export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=$((1024 + RANDOM % 64511))
-
-export TMPDIR=/scratch
-export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
-export CUBLAS_WORKSPACE_CONFIG=":4096:8"
-export CUDA_DEVICE_MAX_CONNECTIONS="1"
-
-huggingface-cli login --token $HUGGINGFACE_TOKEN
-
-
-NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
-CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/config.yaml"
-
-LAUNCHER="torchrun \
- --nproc_per_node 8 \
- --nnodes 8 \
- --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
- --rdzv_backend c10d \
- --max_restarts 0 \
- --tee 3 \
- --node_rank ${SLURM_PROCID}"
-
-# Checkout the bench_cluster branch
-cd $NANOTRON_REPO
-git checkout bench_cluster
-cd ..
-# Get the current job ID
-job_id=${SLURM_JOB_ID}
-
-# Update status to "pending" or "running" in the background
-update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/status.txt &
-
-# Run the main command
-srun -u $LAUNCHER $CMD
-exit_status=$?
-
-# Update status based on the exit status of `srun`
-if [ $exit_status -eq 0 ]; then
- printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/status.txt
-else
- if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out; then
- printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/status.txt
- elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out; then
- printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/status.txt
- elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out; then
- printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/status.txt
- else
- printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/status.txt
- fi
-fi
-
-# Run the report script if the job completed successfully
-if [ $exit_status -eq 0 ]; then
- python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64 --is_logs
- python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64 --is_profiler
-fi
-
-
-# Push to hub the folder using huggingface_cli
-huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64"
-
-# Verify the upload
-if [ $? -eq 0 ]; then
- echo "Uploading to Huggingface Hub successful"
-else
- echo "Failed to upload to Huggingface Hub"
-fi
\ No newline at end of file
diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/config.yaml
deleted file mode 100644
index 144d093637804958a8a7a7a6d6cb4b32bed28c83..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/config.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-general:
- project: bench_cluster
- seed: 42
-model:
- ddp_bucket_cap_mb: 25
- dtype: bfloat16
- init_method:
- std: 0.025
- make_vocab_size_divisible_by: 1
- model_config:
- bos_token_id: 1
- eos_token_id: 2
- hidden_act: silu
- hidden_size: 2048
- initializer_range: 0.02
- intermediate_size: 4096
- is_llama_config: true
- max_position_embeddings: 4096
- num_attention_heads: 32
- num_hidden_layers: 24
- num_key_value_heads: 32
- pad_token_id: null
- pretraining_tp: 1
- rms_norm_eps: 1.0e-05
- rope_scaling: null
- rope_theta: 10000.0
- tie_word_embeddings: true
- use_cache: true
- vocab_size: 50257
-optimizer:
- accumulate_grad_in_fp32: true
- clip_grad: 1.0
- learning_rate_scheduler:
- learning_rate: 0.0001
- lr_decay_style: linear
- lr_warmup_style: linear
- lr_warmup_steps: 1
- min_decay_lr: 1.0e-05
- optimizer_factory:
- adam_beta1: 0.9
- adam_beta2: 0.95
- adam_eps: 1.0e-08
- name: adamW
- torch_adam_is_fused: true
- weight_decay: 0.01
- zero_stage: 1
-parallelism:
- dp: 2
- expert_parallel_size: 1
- pp: 1
- pp_engine: 1f1b
- tp: 32
- tp_linear_async_communication: false
- tp_mode: REDUCE_SCATTER
-profiler:
- profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64
-tokenizer:
- tokenizer_max_length: null
- tokenizer_name_or_path: openai-community/gpt2
- tokenizer_revision: null
-data_stages:
-- name: Training Stage
- start_training_step: 1
- data:
- dataset:
- dataset_overwrite_cache: false
- dataset_processing_num_proc_per_process: 64
- hf_dataset_config_name: null
- hf_dataset_or_datasets: roneneldan/TinyStories
- hf_dataset_splits: train
- text_column_name: text
- num_loading_workers: 0
- seed: 42
-lighteval: null
-tokens:
- train_steps: 20
- val_check_interval: -1
- batch_accumulation_per_replica: 8
- limit_test_batches: 0
- limit_val_batches: 0
- micro_batch_size: 64
- sequence_length: 4096
-logging:
- iteration_step_info_interval: 1
- log_level: info
- log_level_replica: info
-checkpoints:
- checkpoint_interval: 100000
- checkpoints_path: /dev/null
- resume_checkpoint_path: null
diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out
deleted file mode 100644
index 06098e8c4af7464d294a7a40e3e5960d45d9f040..0000000000000000000000000000000000000000
--- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/log.out
+++ /dev/null
@@ -1,737 +0,0 @@
-========================
-START TIME: Wed Jul 3 01:52:43 UTC 2024
-python3 version = Python 3.10.14
-========================
-The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
-Token is valid (permission: write).
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 01:52:51.151000 139793192929088 torch/distributed/run.py:757] -W0703 01:52:51.151000 139793192929088 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.151000 139793192929088 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:52:51.151000 139793192929088 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.237000 139671689652032 torch/distributed/run.py:757] -W0703 01:52:51.237000 139671689652032 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.237000 139671689652032 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:52:51.237000 139671689652032 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.238000 140172849612608 torch/distributed/run.py:757] -W0703 01:52:51.238000 140172849612608 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.238000 140172849612608 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:52:51.238000 140172849612608 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.241000 139825456662336 torch/distributed/run.py:757] -W0703 01:52:51.241000 139825456662336 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.241000 139825456662336 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:52:51.241000 139825456662336 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.255000 140245753120576 torch/distributed/run.py:757] -W0703 01:52:51.255000 140245753120576 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.255000 140245753120576 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 01:52:51.255000 140245753120576 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.263000 139910532134720 torch/distributed/run.py:757] -W0703 01:52:51.263000 139910532134720 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.263000 139910532134720 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:52:51.263000 139910532134720 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.286000 139658693232448 torch/distributed/run.py:757] -W0703 01:52:51.286000 139658693232448 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.286000 139658693232448 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:52:51.286000 139658693232448 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.559000 140696109762368 torch/distributed/run.py:757] -W0703 01:52:51.559000 140696109762368 torch/distributed/run.py:757] ***************************************** -W0703 01:52:51.559000 140696109762368 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:52:51.559000 140696109762368 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 01:53:16 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:53:16 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=64, -[default0]:07/03/2024 01:53:16 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=8, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64')), -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272)
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model..
-[default0]:07/03/2024 01:53:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks...
-[default5]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB)
-[default5]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB
-[default5]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: No checkpoint path provided.
-[default4]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB)
-[default4]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB
-[default4]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: No checkpoint path provided.
-[default7]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB)
-[default7]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB
-[default7]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: No checkpoint path provided.
-[default1]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB)
-[default1]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB
-[default1]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: No checkpoint path provided.
-[default3]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB)
-[default3]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB
-[default3]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: No checkpoint path provided.
-[default0]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2122.50MiB)
-[default0]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB)
-[default0]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB
-[default0]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
-[default0]:07/03/2024 01:53:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator
-[default4]:07/03/2024 01:53:35 [INFO|DP=1|PP=0|TP=28|ip-26-0-172-73]: No checkpoint path provided.
-[default2]:07/03/2024 01:53:35 [INFO|DP=1|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided.
-[default5]:07/03/2024 01:53:35 [INFO|DP=1|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided.
-[default3]:07/03/2024 01:53:35 [INFO|DP=1|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided.
-[default6]:07/03/2024 01:53:35 [INFO|DP=1|PP=0|TP=30|ip-26-0-172-73]: No checkpoint path provided.
-[default0]:07/03/2024 01:53:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 01:53:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 01:53:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states
-[default0]:07/03/2024 01:53:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states
-[default0]:07/03/2024 01:53:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 01:53:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library
-[default0]:07/03/2024 01:53:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 01:53:38 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 01:53:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 01:53:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 01:53:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
-[default0]:07/03/2024 01:53:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 01:53:40.687873 | mbs: 64 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 01:53:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 01:53:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB
-[default5]:07/03/2024 01:53:40 [WARNING|DP=0|PP=0|TP=21|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 01:53:40 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 01:53:40 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/03/2024 01:53:40 [WARNING|DP=1|PP=0|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 01:53:40 [WARNING|DP=1|PP=0|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/03/2024 01:53:56 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default6]:07/03/2024 01:53:56 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
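The warning above repeats on every rank: autograd is backpropagating through the in-place c10d::allreduce_ op, which has no autograd kernel registered, and the proper fix is the C++-side registration the message describes. To merely keep the noise out of the benchmark logs, a minimal Python-side filter could look like the sketch below (a workaround only; it does not address the potential silent-correctness issue the warning mentions):

import warnings

# Hypothetical stopgap: hide the repeated c10d::allreduce_ autograd warning.
# This only suppresses the message; it does not register the fallthrough
# kernel (torch::CppFunction::makeFallthrough) that the warning asks for.
warnings.filterwarnings(
    "ignore",
    message=r"c10d::allreduce_: an autograd kernel was not registered",
    category=UserWarning,
)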
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions
-[default4]: warnings.warn(
[... the same all_reduce_coalesced deprecation warning and warnings.warn( line are repeated verbatim by every rank; duplicates omitted ...]
-[default0]:07/03/2024 01:54:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 369.84MiB. Peak allocated 66816.90MiB. Peak reserved: 67676.00MiB
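The all_reduce_coalesced deprecation above is harmless for this run, but if the call were to be replaced, one option is to flatten the tensors into a single buffer and issue one all_reduce. The helper below is a hypothetical sketch (allreduce_flat is not a nanotron or PyTorch function), assuming all tensors share a dtype and device and that the process group is already initialized:

import torch
import torch.distributed as dist

def allreduce_flat(tensors, group=None):
    """Sketch of a coalesced all-reduce without all_reduce_coalesced:
    flatten into one buffer, reduce once, then copy the results back."""
    flat = torch.cat([t.reshape(-1) for t in tensors])
    dist.all_reduce(flat, op=dist.ReduceOp.SUM, group=group)
    offset = 0
    for t in tensors:
        n = t.numel()
        t.copy_(flat[offset:offset + n].view_as(t))
        offset += n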
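The iteration lines below carry the actual benchmark result for this dp-1/tp-16/pp-4 layout on 64 GPUs. As a quick sanity check on the reported fields, the per-GPU throughput follows directly from the logged tokens and step time; the sketch below reproduces the iteration-2 numbers:

# Reproduce the throughput fields from iteration 2 below (values taken from the log).
tokens_per_iteration = 4.19e6   # consumed_tokens grows by ~4.19M per step
                                # (~= global_batch_size 1.02K x an assumed 4096-token sequence length)
elapsed_s = 9.01                # elapsed_time_per_iteration_ms: 9.01K
n_gpus = 64                     # dp-1 * tp-16 * pp-4

tokens_per_sec = tokens_per_iteration / elapsed_s     # ~465K, matches the log
tokens_per_sec_per_gpu = tokens_per_sec / n_gpus      # ~7.27K, matches the log
print(f"{tokens_per_sec:,.0f} tok/s, {tokens_per_sec_per_gpu:,.0f} tok/s/GPU")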
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default0]:07/03/2024 01:54:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 59.9K | tokens_per_sec: 70.1K | tokens_per_sec_per_gpu: 1.09K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 0.0001 | model_tflops_per_gpu: 9.93 | hardware_tflops_per_gpu: 9.93 | grad_norm: 11.1 | cuda_memory_allocated: 527M | cuda_max_memory_reserved: 71G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 01:54:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 668.39MiB. Peak reserved: 67730.00MiB -[default0]:07/03/2024 01:54:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:54:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 9.01K | tokens_per_sec: 465K | tokens_per_sec_per_gpu: 7.27K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 9.53e-05 | model_tflops_per_gpu: 66 | hardware_tflops_per_gpu: 66 | grad_norm: 11.1 | cuda_memory_allocated: 527M | cuda_max_memory_reserved: 71.1G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 01:54:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 668.39MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:54:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. 
Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:54:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 8.85K | tokens_per_sec: 474K | tokens_per_sec_per_gpu: 7.4K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.05e-05 | model_tflops_per_gpu: 67.2 | hardware_tflops_per_gpu: 67.2 | grad_norm: 76 | cuda_memory_allocated: 527M | cuda_max_memory_reserved: 71.1G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 01:54:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 668.39MiB. Peak reserved: 67796.00MiB -[default0]:STAGE:2024-07-03 01:54:58 1120367:1120367 ActivityProfilerController.cpp:314] Completed Stage: Warm Up -[default0]:07/03/2024 01:55:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:55:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 8.89K | tokens_per_sec: 472K | tokens_per_sec_per_gpu: 7.37K | global_batch_size: 1.02K | lm_loss: 11.5 | lr: 8.58e-05 | model_tflops_per_gpu: 66.9 | hardware_tflops_per_gpu: 66.9 | grad_norm: 15.1 | cuda_memory_allocated: 527M | cuda_max_memory_reserved: 71.1G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 01:55:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 668.39MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:55:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 8.84K | tokens_per_sec: 474K | tokens_per_sec_per_gpu: 7.41K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 8.11e-05 | model_tflops_per_gpu: 67.3 | hardware_tflops_per_gpu: 67.3 | grad_norm: 24.2 -[default0]:07/03/2024 01:55:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:55:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 8.89K | tokens_per_sec: 472K | tokens_per_sec_per_gpu: 7.37K | global_batch_size: 1.02K | lm_loss: 10.1 | lr: 7.63e-05 | model_tflops_per_gpu: 66.9 | hardware_tflops_per_gpu: 66.9 | grad_norm: 12.8 -[default0]:STAGE:2024-07-03 01:55:29 1120367:1120367 ActivityProfilerController.cpp:320] Completed Stage: Collection -[default0]:STAGE:2024-07-03 01:55:30 1120367:1120367 ActivityProfilerController.cpp:324] Completed Stage: Post Processing -[default0]:07/03/2024 01:56:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:56:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 8.8K | tokens_per_sec: 477K | tokens_per_sec_per_gpu: 7.45K | global_batch_size: 1.02K | lm_loss: 9.9 | lr: 7.16e-05 | model_tflops_per_gpu: 67.6 | hardware_tflops_per_gpu: 67.6 | grad_norm: 9.27 -[default0]:07/03/2024 01:56:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. 
Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:56:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 8.84K | tokens_per_sec: 474K | tokens_per_sec_per_gpu: 7.41K | global_batch_size: 1.02K | lm_loss: 9.56 | lr: 6.68e-05 | model_tflops_per_gpu: 67.2 | hardware_tflops_per_gpu: 67.2 | grad_norm: 6.92 -[default0]:07/03/2024 01:56:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:56:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 8.87K | tokens_per_sec: 473K | tokens_per_sec_per_gpu: 7.39K | global_batch_size: 1.02K | lm_loss: 9.2 | lr: 6.21e-05 | model_tflops_per_gpu: 67 | hardware_tflops_per_gpu: 67 | grad_norm: 6.51 -[default0]:07/03/2024 01:56:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:56:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 8.87K | tokens_per_sec: 473K | tokens_per_sec_per_gpu: 7.39K | global_batch_size: 1.02K | lm_loss: 8.88 | lr: 5.74e-05 | model_tflops_per_gpu: 67.1 | hardware_tflops_per_gpu: 67.1 | grad_norm: 5.55 -[default0]:07/03/2024 01:56:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:56:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 8.84K | tokens_per_sec: 474K | tokens_per_sec_per_gpu: 7.41K | global_batch_size: 1.02K | lm_loss: 8.69 | lr: 5.26e-05 | model_tflops_per_gpu: 67.2 | hardware_tflops_per_gpu: 67.2 | grad_norm: 5.87 -[default0]:07/03/2024 01:56:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:56:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 8.84K | tokens_per_sec: 475K | tokens_per_sec_per_gpu: 7.42K | global_batch_size: 1.02K | lm_loss: 8.48 | lr: 4.79e-05 | model_tflops_per_gpu: 67.3 | hardware_tflops_per_gpu: 67.3 | grad_norm: 5.82 -[default0]:07/03/2024 01:56:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:57:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 8.85K | tokens_per_sec: 474K | tokens_per_sec_per_gpu: 7.4K | global_batch_size: 1.02K | lm_loss: 8.25 | lr: 4.32e-05 | model_tflops_per_gpu: 67.2 | hardware_tflops_per_gpu: 67.2 | grad_norm: 5.08 -[default0]:07/03/2024 01:57:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:57:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 8.89K | tokens_per_sec: 472K | tokens_per_sec_per_gpu: 7.37K | global_batch_size: 1.02K | lm_loss: 8.1 | lr: 3.84e-05 | model_tflops_per_gpu: 66.9 | hardware_tflops_per_gpu: 66.9 | grad_norm: 5.08 -[default0]:07/03/2024 01:57:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. 
Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:57:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 8.86K | tokens_per_sec: 473K | tokens_per_sec_per_gpu: 7.39K | global_batch_size: 1.02K | lm_loss: 7.99 | lr: 3.37e-05 | model_tflops_per_gpu: 67.1 | hardware_tflops_per_gpu: 67.1 | grad_norm: 5.11 -[default0]:07/03/2024 01:57:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:57:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 8.87K | tokens_per_sec: 473K | tokens_per_sec_per_gpu: 7.39K | global_batch_size: 1.02K | lm_loss: 7.9 | lr: 2.89e-05 | model_tflops_per_gpu: 67 | hardware_tflops_per_gpu: 67 | grad_norm: 5.13 -[default0]:07/03/2024 01:57:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:57:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 8.87K | tokens_per_sec: 473K | tokens_per_sec_per_gpu: 7.39K | global_batch_size: 1.02K | lm_loss: 7.78 | lr: 2.42e-05 | model_tflops_per_gpu: 67 | hardware_tflops_per_gpu: 67 | grad_norm: 4.9 -[default0]:07/03/2024 01:57:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:57:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 8.85K | tokens_per_sec: 474K | tokens_per_sec_per_gpu: 7.4K | global_batch_size: 1.02K | lm_loss: 7.67 | lr: 1.95e-05 | model_tflops_per_gpu: 67.2 | hardware_tflops_per_gpu: 67.2 | grad_norm: 4.65 -[default0]:07/03/2024 01:57:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:58:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 8.85K | tokens_per_sec: 474K | tokens_per_sec_per_gpu: 7.4K | global_batch_size: 1.02K | lm_loss: 7.58 | lr: 1.47e-05 | model_tflops_per_gpu: 67.2 | hardware_tflops_per_gpu: 67.2 | grad_norm: 4.55 -[default0]:07/03/2024 01:58:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 502.57MiB. Peak allocated 66949.64MiB. Peak reserved: 67796.00MiB -[default0]:07/03/2024 01:58:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 8.87K | tokens_per_sec: 473K | tokens_per_sec_per_gpu: 7.39K | global_batch_size: 1.02K | lm_loss: 7.53 | lr: 1e-05 | model_tflops_per_gpu: 67 | hardware_tflops_per_gpu: 67 | grad_norm: 4.51 -Saved 1 csv files over 1 completed logs -Processing file: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/profiler/ip-26-0-160-192_1120367.1719971759042826136.pt.trace.json -Results written to /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-64/profiler.csv -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-192_1120367.1719971759042826136.pt.trace.json: 0%| | 0.00/1.17G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8 llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8 --commit-message "Upload llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/config.yaml b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/config.yaml deleted file mode 100644 index 895d70358e87293c2ae520b90b217925611ade8d..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 2 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 32 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 64 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 8 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/log.out b/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/log.out deleted file mode 100644 index 8ec0583c36aaefe18b7717065ad8613f24fe67b3..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/log.out +++ /dev/null @@ -1,737 +0,0 @@ -======================== -START TIME: Wed Jul 3 07:14:44 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 07:14:49.690000 140370916841280 torch/distributed/run.py:757] -W0703 07:14:49.690000 140370916841280 torch/distributed/run.py:757] ***************************************** -W0703 07:14:49.690000 140370916841280 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:14:49.690000 140370916841280 torch/distributed/run.py:757] ***************************************** -W0703 07:14:49.856000 139681756510016 torch/distributed/run.py:757] -W0703 07:14:49.856000 139681756510016 torch/distributed/run.py:757] ***************************************** -W0703 07:14:49.856000 139681756510016 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:14:49.856000 139681756510016 torch/distributed/run.py:757] ***************************************** -W0703 07:14:49.905000 140534538422080 torch/distributed/run.py:757] -W0703 07:14:49.905000 140534538422080 torch/distributed/run.py:757] ***************************************** -W0703 07:14:49.905000 140534538422080 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:14:49.905000 140534538422080 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.095000 140157070346048 torch/distributed/run.py:757] -W0703 07:14:50.095000 140157070346048 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.095000 140157070346048 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:14:50.095000 140157070346048 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.202000 140644929955648 torch/distributed/run.py:757] -W0703 07:14:50.202000 140644929955648 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.202000 140644929955648 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 07:14:50.202000 140644929955648 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.305000 140456829589312 torch/distributed/run.py:757] -W0703 07:14:50.305000 140456829589312 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.305000 140456829589312 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:14:50.305000 140456829589312 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.333000 140430705157952 torch/distributed/run.py:757] -W0703 07:14:50.333000 140430705157952 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.333000 140430705157952 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:14:50.333000 140430705157952 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.446000 140699014350656 torch/distributed/run.py:757] -W0703 07:14:50.446000 140699014350656 torch/distributed/run.py:757] ***************************************** -W0703 07:14:50.446000 140699014350656 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 07:14:50.446000 140699014350656 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 07:15:15 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=2, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=32, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 07:15:15 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=8, -[default0]:07/03/2024 07:15:15 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=64, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8')), -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 07:15:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=30|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=25|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=24|ip-26-0-163-226]: No checkpoint path provided. 
-[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=26|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=29|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=28|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=27|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2122.50MiB) -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=31|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default5]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=21|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=23|ip-26-0-163-220]: No checkpoint path provided. -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=17|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. 
-[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default1]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default2]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=18|ip-26-0-163-220]: No checkpoint path provided. -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 34.8M (66.33MiB) -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default7]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default0]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=16|ip-26-0-163-220]: No checkpoint path provided. -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default6]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=22|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. 
Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: Local number of parameters: 34.8M (66.33MiB) -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: [After model building] Memory usage: 90.35MiB. Peak allocated: 98.52MiB Peak reserved: 108.00MiB -[default4]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=20|ip-26-0-163-220]: No checkpoint path provided. -[default3]:07/03/2024 07:15:33 [INFO|DP=0|PP=0|TP=19|ip-26-0-163-220]: No checkpoint path provided. -[default5]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=18|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=23|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=24|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=28|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=21|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=17|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=20|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=19|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=22|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=25|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=27|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=30|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=26|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=31|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=29|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=16|ip-26-0-172-57]: No checkpoint path provided. 
-[default3]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 07:15:34 [INFO|DP=1|PP=0|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 07:15:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 07:15:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 07:15:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 07:15:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 34.8M (50.00%) params' optimizer states -[default0]:07/03/2024 07:15:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 07:15:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 07:15:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 07:15:37 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:15:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 07:15:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 07:15:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 07:15:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 07:15:39.211627 | mbs: 8 | grad_accum: 64 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 07:15:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 07:15:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 289.33MiB. Peak allocated 289.33MiB. Peak reserved: 310.00MiB -[default0]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=24|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=30|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=26|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=29|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=27|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 07:15:39 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. 
-[... identical "Repo card metadata block was not found. Setting CardData to empty." warnings from the remaining ranks ...]
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[... the same c10d::allreduce_ UserWarning and "return Variable._execution_engine.run_backward(" line from the remaining ranks ...]
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd).
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default2]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default0]:07/03/2024 07:16:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 365.93MiB. Peak allocated 8671.59MiB. Peak reserved: 9074.00MiB -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default5]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default0]:07/03/2024 07:16:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 29.3K | tokens_per_sec: 143K | tokens_per_sec_per_gpu: 2.24K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 0.0001 | model_tflops_per_gpu: 20.3 | hardware_tflops_per_gpu: 20.3 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 9.57G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 07:16:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 664.98MiB. Peak reserved: 9130.00MiB -[default0]:07/03/2024 07:16:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.69MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:16:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 12.4K | tokens_per_sec: 339K | tokens_per_sec_per_gpu: 5.3K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 9.53e-05 | model_tflops_per_gpu: 48.1 | hardware_tflops_per_gpu: 48.1 | grad_norm: 11.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 9.64G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 07:16:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 665.02MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:16:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.69MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:16:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 348K | tokens_per_sec_per_gpu: 5.43K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.05e-05 | model_tflops_per_gpu: 49.3 | hardware_tflops_per_gpu: 49.3 | grad_norm: 76 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 9.64G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 07:16:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 665.02MiB. Peak reserved: 9194.00MiB -[default0]:STAGE:2024-07-03 07:16:33 1179034:1179034 ActivityProfilerController.cpp:314] Completed Stage: Warm Up -[default0]:07/03/2024 07:16:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.69MiB. Peak allocated 8804.35MiB. 
Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:16:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 13.9K | tokens_per_sec: 301K | tokens_per_sec_per_gpu: 4.71K | global_batch_size: 1.02K | lm_loss: 11.5 | lr: 8.58e-05 | model_tflops_per_gpu: 42.7 | hardware_tflops_per_gpu: 42.7 | grad_norm: 15.1 | cuda_memory_allocated: 523M | cuda_max_memory_reserved: 9.64G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/03/2024 07:16:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 665.02MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:17:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 13.7K | tokens_per_sec: 305K | tokens_per_sec_per_gpu: 4.77K | global_batch_size: 1.02K | lm_loss: 11.6 | lr: 8.11e-05 | model_tflops_per_gpu: 43.3 | hardware_tflops_per_gpu: 43.3 | grad_norm: 24.2 -[default0]:07/03/2024 07:17:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:17:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 13.8K | tokens_per_sec: 305K | tokens_per_sec_per_gpu: 4.76K | global_batch_size: 1.02K | lm_loss: 10.1 | lr: 7.63e-05 | model_tflops_per_gpu: 43.2 | hardware_tflops_per_gpu: 43.2 | grad_norm: 12.8 -[default0]:STAGE:2024-07-03 07:17:51 1179034:1179034 ActivityProfilerController.cpp:320] Completed Stage: Collection -[default0]:STAGE:2024-07-03 07:17:54 1179034:1179034 ActivityProfilerController.cpp:324] Completed Stage: Post Processing -[default0]:07/03/2024 07:22:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:22:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 12.2K | tokens_per_sec: 344K | tokens_per_sec_per_gpu: 5.38K | global_batch_size: 1.02K | lm_loss: 9.9 | lr: 7.16e-05 | model_tflops_per_gpu: 48.8 | hardware_tflops_per_gpu: 48.8 | grad_norm: 9.26 -[default0]:07/03/2024 07:22:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:22:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 345K | tokens_per_sec_per_gpu: 5.4K | global_batch_size: 1.02K | lm_loss: 9.56 | lr: 6.68e-05 | model_tflops_per_gpu: 49 | hardware_tflops_per_gpu: 49 | grad_norm: 6.93 -[default0]:07/03/2024 07:22:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 12.3K | tokens_per_sec: 340K | tokens_per_sec_per_gpu: 5.32K | global_batch_size: 1.02K | lm_loss: 9.2 | lr: 6.21e-05 | model_tflops_per_gpu: 48.3 | hardware_tflops_per_gpu: 48.3 | grad_norm: 6.51 -[default0]:07/03/2024 07:23:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. 
Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:23:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 14.4K | tokens_per_sec: 290K | tokens_per_sec_per_gpu: 4.54K | global_batch_size: 1.02K | lm_loss: 8.88 | lr: 5.74e-05 | model_tflops_per_gpu: 41.2 | hardware_tflops_per_gpu: 41.2 | grad_norm: 5.56 -[default0]:07/03/2024 07:23:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:23:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 12.2K | tokens_per_sec: 344K | tokens_per_sec_per_gpu: 5.38K | global_batch_size: 1.02K | lm_loss: 8.69 | lr: 5.26e-05 | model_tflops_per_gpu: 48.8 | hardware_tflops_per_gpu: 48.8 | grad_norm: 5.87 -[default0]:07/03/2024 07:23:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:23:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 347K | tokens_per_sec_per_gpu: 5.42K | global_batch_size: 1.02K | lm_loss: 8.48 | lr: 4.79e-05 | model_tflops_per_gpu: 49.2 | hardware_tflops_per_gpu: 49.2 | grad_norm: 5.82 -[default0]:07/03/2024 07:23:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:23:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 347K | tokens_per_sec_per_gpu: 5.42K | global_batch_size: 1.02K | lm_loss: 8.25 | lr: 4.32e-05 | model_tflops_per_gpu: 49.2 | hardware_tflops_per_gpu: 49.2 | grad_norm: 5.08 -[default0]:07/03/2024 07:23:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:24:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 347K | tokens_per_sec_per_gpu: 5.43K | global_batch_size: 1.02K | lm_loss: 8.1 | lr: 3.84e-05 | model_tflops_per_gpu: 49.2 | hardware_tflops_per_gpu: 49.2 | grad_norm: 5.09 -[default0]:07/03/2024 07:24:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:24:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 348K | tokens_per_sec_per_gpu: 5.43K | global_batch_size: 1.02K | lm_loss: 7.99 | lr: 3.37e-05 | model_tflops_per_gpu: 49.3 | hardware_tflops_per_gpu: 49.3 | grad_norm: 5.11 -[default0]:07/03/2024 07:24:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:24:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 347K | tokens_per_sec_per_gpu: 5.43K | global_batch_size: 1.02K | lm_loss: 7.9 | lr: 2.89e-05 | model_tflops_per_gpu: 49.2 | hardware_tflops_per_gpu: 49.2 | grad_norm: 5.13 -[default0]:07/03/2024 07:24:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. 
Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:24:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 348K | tokens_per_sec_per_gpu: 5.43K | global_batch_size: 1.02K | lm_loss: 7.78 | lr: 2.42e-05 | model_tflops_per_gpu: 49.3 | hardware_tflops_per_gpu: 49.3 | grad_norm: 4.9 -[default0]:07/03/2024 07:24:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:24:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 12.1K | tokens_per_sec: 347K | tokens_per_sec_per_gpu: 5.42K | global_batch_size: 1.02K | lm_loss: 7.67 | lr: 1.95e-05 | model_tflops_per_gpu: 49.2 | hardware_tflops_per_gpu: 49.2 | grad_norm: 4.65 -[default0]:07/03/2024 07:24:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:25:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 14.1K | tokens_per_sec: 297K | tokens_per_sec_per_gpu: 4.65K | global_batch_size: 1.02K | lm_loss: 7.59 | lr: 1.47e-05 | model_tflops_per_gpu: 42.2 | hardware_tflops_per_gpu: 42.2 | grad_norm: 4.55 -[default0]:07/03/2024 07:25:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 498.66MiB. Peak allocated 8804.35MiB. Peak reserved: 9194.00MiB -[default0]:07/03/2024 07:25:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 12.3K | tokens_per_sec: 340K | tokens_per_sec_per_gpu: 5.32K | global_batch_size: 1.02K | lm_loss: 7.53 | lr: 1e-05 | model_tflops_per_gpu: 48.2 | hardware_tflops_per_gpu: 48.2 | grad_norm: 4.51 -W0703 07:26:22.488000 140699014350656 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1874742_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:26:22.490000 140644929955648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1901718_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:26:22.494000 140699014350656 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1874742_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 07:26:22.497000 140644929955648 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1901718_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Saved 1 csv files over 1 completed logs -Processing file: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/profiler/ip-26-0-160-192_1179034.1719991285734442922.pt.trace.json -Results written to /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-2_tp-32_pp-1_mbz-8/profiler.csv -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-192_1179034.1719991285734442922.pt.trace.json: 0%| | 0.00/9.03G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
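The per-iteration lines in the log above can be cross-checked by hand: each optimizer step consumes global_batch_size × sequence_length tokens, and dividing by the step time and by the 64 GPUs in the job reproduces the tokens_per_sec and tokens_per_sec_per_gpu columns. A minimal sketch, assuming the usual definitions (the helper below is illustrative and not part of the benchmark code; nanotron's logger may round differently):

```python
# Illustrative re-derivation of the throughput columns printed in the log above.
GPUS = 64
SEQ_LEN = 4096
GLOBAL_BATCH_SIZE = 1024  # sequences per step, shown as "1.02K" in the log


def throughput(elapsed_ms: float) -> tuple[float, float]:
    tokens_per_step = GLOBAL_BATCH_SIZE * SEQ_LEN        # ~4.19M tokens per iteration
    tokens_per_sec = tokens_per_step / (elapsed_ms / 1000.0)
    return tokens_per_sec, tokens_per_sec / GPUS


# Iteration 12 above reports elapsed_time_per_iteration_ms: 12.1K,
# tokens_per_sec: 347K, tokens_per_sec_per_gpu: 5.42K.
tps, tps_per_gpu = throughput(12_100)
print(f"{tps / 1e3:.0f}K tokens/s, {tps_per_gpu / 1e3:.2f}K tokens/s/GPU")  # ~347K, ~5.42K
```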
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/config.yaml deleted file mode 100644 index 764af9433810e42b385f3c1ebe50e7814c1bfd48..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 256 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 1 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/log.out deleted file mode 100644 index bb8e04c02d2d6d59b7d0ea7d6bf52832d10cce7b..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1/log.out +++ /dev/null @@ -1,1801 +0,0 @@ -======================== -START TIME: Wed Jul 3 00:33:48 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
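The config above fixes both the topology and the effective batch: dp × tp × pp = 4 × 16 × 1 = 64 GPUs (matching the torchrun launch with --nnodes 8 and --nproc_per_node 8), and each step processes dp × batch_accumulation_per_replica × micro_batch_size = 4 × 256 × 1 = 1024 sequences of 4096 tokens, i.e. the ~4.19M consumed tokens per iteration reported in these logs. A small sketch with the values from this config (illustrative only, not part of the repo):

```python
# Illustrative check of the dp-4_tp-16_pp-1_mbz-1 configuration above.
dp, tp, pp = 4, 16, 1
micro_batch_size = 1
batch_accumulation_per_replica = 256
sequence_length = 4096

world_size = dp * tp * pp  # 64 GPUs, i.e. 8 nodes x 8 GPUs
global_batch_size = dp * batch_accumulation_per_replica * micro_batch_size  # 1024 sequences ("1.02K")
tokens_per_step = global_batch_size * sequence_length  # 4_194_304 tokens (~4.19M per iteration)
print(world_size, global_batch_size, tokens_per_step)
```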
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 00:33:55.346000 139984549480256 torch/distributed/run.py:757] -W0703 00:33:55.346000 139984549480256 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.346000 139984549480256 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:33:55.346000 139984549480256 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.347000 139945336207168 torch/distributed/run.py:757] -W0703 00:33:55.347000 139945336207168 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.347000 139945336207168 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:33:55.347000 139945336207168 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.346000 139720846501696 torch/distributed/run.py:757] -W0703 00:33:55.346000 139720846501696 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.346000 139720846501696 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:33:55.346000 139720846501696 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.527000 139674419038016 torch/distributed/run.py:757] -W0703 00:33:55.527000 139674419038016 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.527000 139674419038016 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:33:55.527000 139674419038016 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.578000 140372375672640 torch/distributed/run.py:757] -W0703 00:33:55.578000 140372375672640 torch/distributed/run.py:757] ***************************************** -W0703 00:33:55.578000 140372375672640 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
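As in the run earlier in this diff, much of this log is the same two PyTorch UserWarnings repeated by every rank (the c10d::allreduce_ autograd notice and the torch.distributed.all_reduce_coalesced deprecation). If that repetition is only noise for these benchmarks, one option is to filter the messages near the top of the training entrypoint. A hedged sketch, independent of what run_train.py actually does; it only reduces log volume and does not change training behavior:

```python
# Illustrative: silence the two UserWarnings that dominate these logs.
import warnings

warnings.filterwarnings(
    "ignore", message=r"c10d::allreduce_.*", category=UserWarning
)
warnings.filterwarnings(
    "ignore",
    message=r"torch\.distributed\.all_reduce_coalesced will be deprecated.*",
    category=UserWarning,
)
```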
-W0703 00:33:55.578000 140372375672640 torch/distributed/run.py:757] ***************************************** -W0703 00:33:56.312000 139892035856192 torch/distributed/run.py:757] -W0703 00:33:56.312000 139892035856192 torch/distributed/run.py:757] ***************************************** -W0703 00:33:56.312000 139892035856192 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:33:56.312000 139892035856192 torch/distributed/run.py:757] ***************************************** -W0703 00:33:56.330000 140212278097728 torch/distributed/run.py:757] -W0703 00:33:56.330000 140212278097728 torch/distributed/run.py:757] ***************************************** -W0703 00:33:56.330000 140212278097728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:33:56.330000 140212278097728 torch/distributed/run.py:757] ***************************************** -W0703 00:33:56.380000 140059295917888 torch/distributed/run.py:757] -W0703 00:33:56.380000 140059295917888 torch/distributed/run.py:757] ***************************************** -W0703 00:33:56.380000 140059295917888 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 00:33:56.380000 140059295917888 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 00:34:22 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=4, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:34:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=1, -[default0]:07/03/2024 00:34:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=256, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-1')), -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/03/2024 00:34:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default5]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. 
-[default2]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default5]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2119.44MiB) -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default6]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. 
-[default3]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default6]:07/03/2024 00:34:40 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default5]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=4|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default7]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default3]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=2|PP=0|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=2|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=3|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=4|ip-26-0-163-226]: No checkpoint path provided. 
-[default7]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=7|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=6|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=5|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=1|ip-26-0-163-226]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 00:34:40 [INFO|DP=1|PP=0|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=7|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=12|ip-26-0-172-73]: No checkpoint path provided. -[default6]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=14|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=13|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=10|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=8|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=11|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=9|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=3|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=0|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=4|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=2|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=15|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=5|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=1|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/03/2024 00:34:40 [INFO|DP=3|PP=0|TP=6|ip-26-0-172-57]: No checkpoint path provided. 
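The model-building lines above report 1.11G total parameters (2119.44MiB) and 69.4M (132.46MiB) per rank, which is consistent with tp=16 and bfloat16 weights. A rough consistency check, not part of the log, with approximate inputs taken from the logged figures:

# Back-of-the-envelope check of the per-rank parameter figures logged above.
total_params = 1.11e9                 # "Total number of parameters: 1.11G"
tp = 16
local_params = total_params / tp      # ~69.4e6, matches "Local number of parameters: 69.4M"
local_mib = local_params * 2 / 2**20  # bfloat16 = 2 bytes/param, ~132 MiB vs. logged 132.46MiB
print(round(local_params / 1e6, 1), round(local_mib, 1))

The small gap against 132.46MiB comes from starting with the rounded "1.11G" figure rather than the exact parameter count.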
-[default0]:07/03/2024 00:34:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 00:34:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 00:34:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 00:34:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 00:34:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 00:34:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 00:34:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 00:34:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/03/2024 00:34:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 00:34:43 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:34:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 00:34:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 00:34:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/03/2024 00:34:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 00:34:45.975805 | mbs: 1 | grad_accum: 256 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 00:34:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 00:34:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=4|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=7|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=5|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. 
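The ZeRO sharding lines above show optimizer states split evenly across the dp=4 replicas (zero_stage: 1 in the config), so each DP rank owns a quarter of the 69.4M parameters local to its TP rank. A short sketch of that split, using the rounded per-rank figure from the log:

# ZeRO stage 1 shards optimizer states evenly across the data-parallel group.
local_params = 69.4e6        # per-TP-rank parameter count from the log (rounded)
dp = 4
shard = local_params / dp    # ~17.35e6, shown (rounded) as "17.4M" in the log
fraction = shard / local_params
print(shard, f"{fraction:.2%}")  # 17350000.0 25.00%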
-[default0]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=12|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=13|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=14|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=11|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=9|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=15|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. 
Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=4|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=2|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=5|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=1|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=6|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=7|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=2|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=3|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=1|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=8|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=0|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=3|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 00:34:46 [WARNING|DP=2|PP=0|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=1|PP=0|TP=6|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 00:34:46 [WARNING|DP=3|PP=0|TP=10|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 00:34:46 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 00:34:51 [WARNING|DP=2|PP=0|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
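The block of near-identical UserWarnings above is PyTorch's autograd engine reporting, once per local rank, that c10d::allreduce_ has no registered autograd kernel; it fires on the first backward pass through that op. The deleted scripts do not filter it, but if one were rerunning the benchmark and wanted a quieter log, the standard library warnings filter could suppress this specific message (a hedged sketch, not part of the original setup):

import warnings

# Hide the repeated "c10d::allreduce_: an autograd kernel was not registered..." warning.
# This only reduces log noise; it does not change the underlying autograd fallback behaviour.
warnings.filterwarnings(
    "ignore",
    message=r"c10d::allreduce_",
    category=UserWarning,
)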
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:07/03/2024 00:35:13 [WARNING|DP=2|PP=0|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
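The c10d::allreduce_ warnings above fire because autograd is asked to backprop through the in-place collective, which has no autograd kernel registered. Tensor-parallel codebases usually sidestep this by wrapping the collective in an explicit torch.autograd.Function. The snippet below is only a minimal sketch of that pattern, not the nanotron implementation: the AllReduceSum name and the single-process gloo group are assumptions made so it runs standalone.

import os
import torch
import torch.distributed as dist


class AllReduceSum(torch.autograd.Function):
    # Differentiable wrapper: the forward performs the collective, the backward is
    # the identity (the usual tensor-parallel convention, since every rank already
    # holds the gradient of the reduced activation).

    @staticmethod
    def forward(ctx, x):
        x = x.clone()  # do not mutate the caller's tensor in place
        dist.all_reduce(x, op=dist.ReduceOp.SUM)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output


if __name__ == "__main__":
    # Single-process "gloo" group so the sketch runs standalone (assumption for illustration).
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    dist.init_process_group("gloo", rank=0, world_size=1)
    x = torch.ones(4, requires_grad=True)
    AllReduceSum.apply(x).sum().backward()
    print(x.grad)  # tensor([1., 1., 1., 1.])
    dist.destroy_process_group()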
-[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default0]:07/03/2024 00:35:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 567.07MiB. Peak allocated 2004.80MiB. Peak reserved: 2110.00MiB
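The deprecation notice points at torch.distributed.all_reduce_coalesced; the documented replacement path is the regular collective API. A minimal sketch of reducing a list of tensors with public calls only (the all_reduce_tensors helper is a made-up name for illustration, not part of the benchmark code):

import torch.distributed as dist


def all_reduce_tensors(tensors, group=None):
    # Issue one async all_reduce per tensor, then wait on all handles.
    # Uses only the public torch.distributed.all_reduce API instead of the
    # deprecated all_reduce_coalesced.
    handles = [
        dist.all_reduce(t, op=dist.ReduceOp.SUM, group=group, async_op=True)
        for t in tensors
    ]
    for handle in handles:
        handle.wait()
    return tensors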
-[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated.
-[default0]:07/03/2024 00:35:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 72.9K | tokens_per_sec: 57.5K | tokens_per_sec_per_gpu: 899 | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 0.0001 | model_tflops_per_gpu: 8.15 | hardware_tflops_per_gpu: 8.15 | grad_norm: 11.5 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 2.26G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 00:35:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.61MiB. Peak allocated 997.65MiB. Peak reserved: 2152.00MiB
-[default0]:07/03/2024 00:36:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.73MiB. Peak allocated 1984.93MiB. Peak reserved: 2230.00MiB
-[default0]:07/03/2024 00:36:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 38.3K | tokens_per_sec: 110K | tokens_per_sec_per_gpu: 1.71K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.53e-05 | model_tflops_per_gpu: 15.5 | hardware_tflops_per_gpu: 15.5 | grad_norm: 11.6 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 2.34G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 00:36:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.61MiB. Peak allocated 997.78MiB. Peak reserved: 2230.00MiB
-[default0]:STAGE:2024-07-03 00:37:20 1111136:1111136 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
-[default0]:07/03/2024 00:37:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.73MiB. Peak allocated 1984.93MiB. Peak reserved: 2230.00MiB
-[default0]:07/03/2024 00:37:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 43.3K | tokens_per_sec: 96.8K | tokens_per_sec_per_gpu: 1.51K | global_batch_size: 1.02K | lm_loss: 11.9 | lr: 9.05e-05 | model_tflops_per_gpu: 13.7 | hardware_tflops_per_gpu: 13.7 | grad_norm: 122 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 2.34G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 00:37:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.61MiB. Peak allocated 997.78MiB. Peak reserved: 2230.00MiB
-[default0]:07/03/2024 00:38:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.73MiB. Peak allocated 1984.93MiB. Peak reserved: 2230.00MiB
-[default0]:07/03/2024 00:38:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 57.5K | tokens_per_sec: 72.9K | tokens_per_sec_per_gpu: 1.14K | global_batch_size: 1.02K | lm_loss: 12.3 | lr: 8.58e-05 | model_tflops_per_gpu: 10.3 | hardware_tflops_per_gpu: 10.3 | grad_norm: 18.3 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 2.34G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G
-[default0]:07/03/2024 00:38:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.61MiB. Peak allocated 997.78MiB. Peak reserved: 2230.00MiB
-[default0]:07/03/2024 00:39:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 57.9K | tokens_per_sec: 72.4K | tokens_per_sec_per_gpu: 1.13K | global_batch_size: 1.02K | lm_loss: 11.2 | lr: 8.11e-05 | model_tflops_per_gpu: 10.3 | hardware_tflops_per_gpu: 10.3 | grad_norm: 29
-[default0]:07/03/2024 00:39:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.61MiB. Peak allocated 1984.93MiB. Peak reserved: 2230.00MiB
-[default0]:07/03/2024 00:40:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 55.6K | tokens_per_sec: 75.4K | tokens_per_sec_per_gpu: 1.18K | global_batch_size: 1.02K | lm_loss: 10.2 | lr: 7.63e-05 | model_tflops_per_gpu: 10.7 | hardware_tflops_per_gpu: 10.7 | grad_norm: 10.4
-[default0]:STAGE:2024-07-03 00:42:41 1111136:1111136 ActivityProfilerController.cpp:320] Completed Stage: Collection
-[default0]:STAGE:2024-07-03 00:42:56 1111136:1111136 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
-[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600070 milliseconds before timing out.
-[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600011 milliseconds before timing out.
-[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600054 milliseconds before timing out.
-[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out.
-[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600046 milliseconds before timing out.
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600041 milliseconds before timing out.
-[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out.
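(Editor's aside, not part of the original log.) The `iteration: N / 20 | key: value | ...` lines a few entries above are the only structured throughput data this run produced before it hung, and they follow a regular ` | `-separated format. The sketch below is a minimal, hypothetical parser for such lines; the function name and the choice to keep values as raw strings are assumptions, not the actual bench_cluster/nanotron report code.

```python
# Minimal sketch (assumption: not the actual bench_cluster report parser) for
# turning one "iteration: ... | key: value | ..." log line into a dict.

def parse_iteration_line(line: str) -> dict:
    """Extract key/value metric pairs from a nanotron-style iteration log line."""
    # Drop the "[default0]:date time [INFO|DP=..|PP=..|TP=..|host]: " prefix.
    payload = line.split("]: ", 1)[-1]
    metrics = {}
    for field in payload.split(" | "):
        key, sep, value = field.partition(": ")
        if sep:  # ignore fragments that are not "key: value" shaped
            metrics[key.strip()] = value.strip()
    return metrics

example = ("[default0]:07/03/2024 00:36:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: "
           "iteration: 2 / 20 | tokens_per_sec_per_gpu: 1.71K | model_tflops_per_gpu: 15.5")
print(parse_iteration_line(example))
# -> {'iteration': '2 / 20', 'tokens_per_sec_per_gpu': '1.71K', 'model_tflops_per_gpu': '15.5'}
```

Values such as `1.71K` or `4.19M` would still need unit normalization before any aggregation; that step is deliberately omitted here.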
-[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out.
[... ranks 3, 4, 5, 11, 12, 13 and 15 report the same _REDUCE_SCATTER_BASE (SeqNum=305728) watchdog timeout, and ranks 16-63 report watchdog timeouts on ALLREDUCE (SeqNum=165), COALESCED (SeqNum=356672) and _ALLGATHER_BASE / _REDUCE_SCATTER_BASE / ALLREDUCE work items with SeqNum in the 356545-356671 range; every operation ran for roughly 600000-603200 milliseconds against the 600000 ms limit before timing out. The remaining per-rank duplicates are omitted here ...]
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 11] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 11] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 11] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600006 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f626605f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f6267338c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f626733da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f626733edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f62b2dd7e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f62b7e1e609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f62b7be9353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600006 milliseconds before timing out.
[... the what() message repeats the same checkTimeout stack trace plus a second trace from ncclCommWatchdog (ProcessGroupNCCL.cpp:1418); ranks 8, 9, 10, 12 and 15 ([default0], [default1], [default2], [default4], [default7]) then abort the same way, each printing "Timeout at NCCL work: 305728", "To avoid data inconsistency, we are taking the entire process down." and an identical c10::DistBackendError watchdog stack trace, with the per-rank frames interleaved in the remainder of the log ...]
-[default0]:frame #3: + 0x8609 (0x7f50ef7fb609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #2: + 0xd3e95 (0x7ff8ac75ce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #4: clone + 0x43 (0x7f4760237353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #4: + 0xd3e95 (0x7f64ce743e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7ff8b17a3609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #5: + 0x8609 (0x7f64d378a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]: -[default4]:frame #4: clone + 0x43 (0x7ff8b156e353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #6: clone + 0x43 (0x7f64d3555353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]:frame #4: clone + 0x43 (0x7f50ef5c6353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default0]: -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f64819cb897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f648292e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f64ce743e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f64d378a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f64d3555353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 13] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 13] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 13] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f60dcc8c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f60ddf65c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60ddf6aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60ddf6bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f6129a04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f612ea4b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f612e816353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600084 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f60dcc8c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f60ddf65c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f60ddf6aa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f60ddf6bdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f6129a04e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f612ea4b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f612e816353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f60dcc8c897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f60ddbef119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7f6129a04e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f612ea4b609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f612e816353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 14] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 14] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 14] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04edec5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f04ef19ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f04ef1a3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f04ef1a4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f053ac3de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f053fc84609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f053fa4f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04edec5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f04ef19ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f04ef1a3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f04ef1a4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f053ac3de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f053fc84609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f053fa4f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f04edec5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f04eee28119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f053ac3de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f053fc84609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f053fa4f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -E0703 00:54:08.982000 139892035856192 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 498475) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED 
-------------------------------------------------------------
-Failures:
-[1]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 9 (local_rank: 1)
- exitcode : -6 (pid: 498476)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498476
-[2]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 10 (local_rank: 2)
- exitcode : -6 (pid: 498477)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498477
-[3]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 11 (local_rank: 3)
- exitcode : -6 (pid: 498478)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498478
-[4]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 12 (local_rank: 4)
- exitcode : -6 (pid: 498479)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498479
-[5]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 13 (local_rank: 5)
- exitcode : -6 (pid: 498480)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498480
-[6]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 14 (local_rank: 6)
- exitcode : -6 (pid: 498481)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498481
-[7]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 15 (local_rank: 7)
- exitcode : -6 (pid: 498482)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498482
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
- time : 2024-07-03_00:54:08
- host : ip-26-0-161-178.ec2.internal
- rank : 8 (local_rank: 0)
- exitcode : -6 (pid: 498475)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 498475
-============================================================
-srun: error: ip-26-0-161-178: task 1: Exited with exit code 1
-[default0]:07/03/2024 01:01:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 699.61MiB. Peak allocated 1984.93MiB. Peak reserved: 2230.00MiB
-[default0]:This error is detected remotely; typically encountered when the peer process is no longer present
-[default0]:This error is detected remotely; typically encountered when the peer process is no longer present
-[default0]:[rank0]: Traceback (most recent call last):
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in
-[default0]:[rank0]: trainer.train(dataloader)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank0]: output = model(**micro_batch)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank0]: return self._call_impl(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank0]: return forward_call(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank0]: sharded_logits = self.model(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank0]: return self._call_impl(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank0]: return forward_call(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank0]: return self._call_impl(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank0]: return forward_call(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default0]:[rank0]: output = self.pp_block(**new_kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank0]: return self._call_impl(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank0]: return forward_call(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default0]:[rank0]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank0]: return self._call_impl(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank0]: return forward_call(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 360, in forward
-[default0]:[rank0]: qkv_states = self.qkv_proj(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank0]: return self._call_impl(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank0]: return forward_call(*args, **kwargs)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
-[default0]:[rank0]: return column_linear(
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 355, in column_linear
-[default0]:[rank0]: input = differentiable_all_gather(input, group=group)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 141, in differentiable_all_gather
-[default0]:[rank0]: return DifferentiableAllGather.apply(tensor, group)
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
-[default0]:[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 82, in forward
-[default0]:[rank0]: dist.all_gather_into_tensor(unsharded_tensor, tensor, group=group)
-[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/distributed.py", line 64, in all_gather_into_tensor
-[default0]:[rank0]: return dist.all_gather_into_tensor(
-[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
-[default0]:[rank0]: return func(*args, **kwargs)
-[default0]:[rank0]: File
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2948, in all_gather_into_tensor -[default0]:[rank0]: work = group._allgather_base(output_tensor, input_tensor, opts) -[default0]:[rank0]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2395, internal error - please report this issue to the NCCL developers, NCCL version 2.20.5 -[default0]:[rank0]: ncclInternalError: Internal check failed. -[default0]:[rank0]: Last error: -[default0]:[rank0]: NET/OFI Request completed with error -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 7] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 7] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 7] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f452eb08897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f452fde1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f452fde6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f452fde7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f457b880e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f45808c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4580692353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f452eb08897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f452fde1c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f452fde6a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f452fde7dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f457b880e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f45808c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f4580692353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f452eb08897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f452fa6b119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f457b880e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f45808c7609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f4580692353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 5] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 5] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 5] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd533cf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdd546a8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdd546ada80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdd546aedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fdda0147e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fdda518e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fdda4f59353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600060 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd533cf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdd546a8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdd546ada80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdd546aedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fdda0147e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fdda518e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fdda4f59353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd533cf897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fdd54332119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fdda0147e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fdda518e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fdda4f59353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 6] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 6] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 6] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d37b40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2d38e19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2d38e1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2d38e1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2d848b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2d898ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2d896ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600046 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d37b40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f2d38e19c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f2d38e1ea80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f2d38e1fdcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f2d848b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f2d898ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f2d896ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f2d37b40897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f2d38aa3119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f2d848b8e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f2d898ff609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f2d896ca353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 2] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f212a71f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f212b9f8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f212b9fda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f212b9fedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f2177497e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f217c4de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 1] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default2]:frame #6: clone + 0x43 (0x7f217c2a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5fc8f3b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5fca214c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5fca219a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5fca21adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f6015cb3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f601acfa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f601aac5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: -[default1]: what(): [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600068 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f212a71f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f212b9f8c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f212b9fda80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f212b9fedcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f2177497e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #5: + 0x8609 (0x7f217c4de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5fc8f3b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #6: clone + 0x43 (0x7f217c2a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f5fca214c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f212a71f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f212b682119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f2177497e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f5fca219a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: + 0x8609 (0x7f217c4de609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f217c2a9353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f5fca21adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]: -[default1]:frame #4: + 0xd3e95 (0x7f6015cb3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f601acfa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f601aac5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: 
c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f5fc8f3b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f5fc9e9e119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f6015cb3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f601acfa609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f601aac5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 4] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 4] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 4] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 4] Process group watchdog thread terminated with exception: [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc80d99f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc80ec78c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc80ec7da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc80ec7edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fc85a717e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fc85f75e609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fc85f529353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 2 Rank 4] Process group watchdog thread terminated with exception: [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc80d99f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fc80ec78c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fc80ec7da80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc80ec7edcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 3] Timeout at NCCL work: 305728, last enqueued NCCL work: 305849, last completed NCCL work: 305727.
-[default4]:frame #4: + 0xd3e95 (0x7fc85a717e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default4]:frame #5: + 0x8609 (0x7fc85f75e609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default4]:frame #6: clone + 0x43 (0x7fc85f529353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
-[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out.
-[default4]:
-[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc80d99f897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #1: + 0xe32119 (0x7fc80e902119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default4]:frame #2: + 0xd3e95 (0x7fc85a717e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default4]:frame #3: + 0x8609 (0x7fc85f75e609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c05d9b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default4]:frame #4: clone + 0x43 (0x7fc85f529353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1c07074c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1c07079a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1c0707adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f1c52b13e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f1c57b5a609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f1c57925353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default4]:
-[default3]:
-[default3]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default3]: what(): [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=305728, OpType=_REDUCE_SCATTER_BASE, NumelIn=8388608, NumelOut=524288, Timeout(ms)=600000) ran for 600026 milliseconds before timing out.
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c05d9b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f1c07074c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f1c07079a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f1c0707adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #4: + 0xd3e95 (0x7f1c52b13e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #5: + 0x8609 (0x7f1c57b5a609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #6: clone + 0x43 (0x7f1c57925353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f1c05d9b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default3]:frame #1: + 0xe32119 (0x7f1c06cfe119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default3]:frame #2: + 0xd3e95 (0x7f1c52b13e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default3]:frame #3: + 0x8609 (0x7f1c57b5a609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default3]:frame #4: clone + 0x43 (0x7f1c57925353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default3]:
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 0] Process group watchdog thread terminated with exception: NCCL error: internal error - please report this issue to the NCCL developers, NCCL version 2.20.5
-[default0]:ncclInternalError: Internal check failed.
-[default0]:Last error:
-[default0]:
-[default0]:Exception raised from checkForNCCLErrorsInternal at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1723 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc154763897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::checkForNCCLErrorsInternal(std::shared_ptr&) + 0x220 (0x7fc155a3c5f0 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::WorkNCCL::checkAndSetException() + 0x7c (0x7fc155a3c83c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::watchdogHandler() + 0x180 (0x7fc155a41a60 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc155a42dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #5: + 0xd3e95 (0x7fc1a14dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #6: + 0x8609 (0x7fc1a6522609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #7: clone + 0x43 (0x7fc1a62ed353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:terminate called after throwing an instance of 'c10::DistBackendError'
-[default0]: what(): [PG 2 Rank 0] Process group watchdog thread terminated with exception: NCCL error: internal error - please report this issue to the NCCL developers, NCCL version 2.20.5
-[default0]:ncclInternalError: Internal check failed.
-[default0]:Last error:
-[default0]:
-[default0]:Exception raised from checkForNCCLErrorsInternal at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1723 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc154763897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: c10d::ProcessGroupNCCL::checkForNCCLErrorsInternal(std::shared_ptr&) + 0x220 (0x7fc155a3c5f0 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: c10d::ProcessGroupNCCL::WorkNCCL::checkAndSetException() + 0x7c (0x7fc155a3c83c in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #3: c10d::ProcessGroupNCCL::watchdogHandler() + 0x180 (0x7fc155a41a60 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #4: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fc155a42dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #5: + 0xd3e95 (0x7fc1a14dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #6: + 0x8609 (0x7fc1a6522609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #7: clone + 0x43 (0x7fc1a62ed353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first):
-[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc154763897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so)
-[default0]:frame #1: + 0xe32119 (0x7fc1556c6119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
-[default0]:frame #2: + 0xd3e95 (0x7fc1a14dbe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6)
-[default0]:frame #3: + 0x8609 (0x7fc1a6522609 in /lib/x86_64-linux-gnu/libpthread.so.0)
-[default0]:frame #4: clone + 0x43 (0x7fc1a62ed353 in /lib/x86_64-linux-gnu/libc.so.6)
-[default0]:
-W0703 01:01:14.350000 140372375672640 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1111136 closing signal SIGTERM
-E0703 01:01:16.667000 140372375672640 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 1111137) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in
- sys.exit(main())
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
- return f(*args, **kwargs)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
- run(args)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
- elastic_launch(
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
- return launch_agent(self._config, self._entrypoint, list(args))
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
- raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures:
-[1]:
- time : 2024-07-03_01:01:14
- host : ip-26-0-160-192.ec2.internal
- rank : 2 (local_rank: 2)
- exitcode : -6 (pid: 1111138)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 1111138
-[2]:
- time : 2024-07-03_01:01:14
- host : ip-26-0-160-192.ec2.internal
- rank : 3 (local_rank: 3)
- exitcode : -6 (pid: 1111139)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 1111139
-[3]:
- time : 2024-07-03_01:01:14
- host : ip-26-0-160-192.ec2.internal
- rank : 4 (local_rank: 4)
- exitcode : -6 (pid: 1111140)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 1111140
-[4]:
- time : 2024-07-03_01:01:14
- host : ip-26-0-160-192.ec2.internal
- rank : 5 (local_rank: 5)
- exitcode : -6 (pid: 1111141)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 1111141
-[5]:
- time : 2024-07-03_01:01:14
- host : ip-26-0-160-192.ec2.internal
- rank : 6 (local_rank: 6)
- exitcode : -6 (pid: 1111142)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 1111142
-[6]:
- time : 2024-07-03_01:01:14
- host : ip-26-0-160-192.ec2.internal
- rank : 7 (local_rank: 7)
- exitcode : -6 (pid: 1111143)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 1111143
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
- time : 2024-07-03_01:01:14
- host : ip-26-0-160-192.ec2.internal
- rank : 1 (local_rank: 1)
- exitcode : -6 (pid: 1111137)
- error_file:
- traceback : Signal 6 (SIGABRT) received by PID 1111137
-============================================================
-srun: error: ip-26-0-160-192: task 0: Exited with exit code 1
-W0703 01:01:17.864000 139668758304512 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1835128_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 01:01:18.568000 140053635184384 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_871296_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 01:01:18.608000 139939675473664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1034730_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 01:01:18.670000 140206617364224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3194703_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 01:01:18.671000 139715185768192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1807469_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 01:01:18.707000 139978888746752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_876381_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:19.209000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871371 closing signal SIGTERM -W0703 01:01:19.210000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871372 closing signal SIGTERM -W0703 01:01:19.210000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871373 closing signal SIGTERM -W0703 01:01:19.210000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871374 closing signal SIGTERM -W0703 01:01:19.211000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871375 closing signal SIGTERM -W0703 01:01:19.211000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871376 closing signal SIGTERM -W0703 01:01:19.212000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871377 closing signal SIGTERM -W0703 01:01:19.212000 140059295917888 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 871378 closing signal SIGTERM -W0703 01:01:19.262000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034805 closing signal SIGTERM -W0703 01:01:19.262000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034806 closing signal SIGTERM -W0703 01:01:19.262000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034807 closing signal SIGTERM -W0703 01:01:19.263000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034808 closing signal SIGTERM -W0703 01:01:19.266000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034809 closing signal SIGTERM -W0703 01:01:19.266000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034810 closing signal SIGTERM -W0703 01:01:19.266000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034811 closing signal SIGTERM -W0703 01:01:19.268000 139945336207168 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1034812 closing signal SIGTERM -W0703 01:01:19.271000 139984549480256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876456 closing signal SIGTERM -W0703 01:01:19.271000 139984549480256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876457 closing signal SIGTERM -W0703 01:01:19.271000 139984549480256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876458 closing signal SIGTERM -W0703 01:01:19.272000 139984549480256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876459 closing signal SIGTERM -W0703 01:01:19.273000 139984549480256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876460 closing signal SIGTERM -W0703 01:01:19.273000 139984549480256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876461 closing signal SIGTERM -W0703 01:01:19.273000 139984549480256 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876462 closing signal SIGTERM -W0703 01:01:19.274000 139984549480256 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 876463 closing signal SIGTERM -W0703 01:01:19.277000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194779 closing signal SIGTERM -W0703 01:01:19.278000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194780 closing signal SIGTERM -W0703 01:01:19.278000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194781 closing signal SIGTERM -W0703 01:01:19.278000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194782 closing signal SIGTERM -W0703 01:01:19.279000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194783 closing signal SIGTERM -W0703 01:01:19.279000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194784 closing signal SIGTERM -W0703 01:01:19.280000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194785 closing signal SIGTERM -W0703 01:01:19.282000 140212278097728 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3194786 closing signal SIGTERM -W0703 01:01:19.283000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807545 closing signal SIGTERM -W0703 01:01:19.284000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807546 closing signal SIGTERM -W0703 01:01:19.284000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807547 closing signal SIGTERM -W0703 01:01:19.285000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807548 closing signal SIGTERM -W0703 01:01:19.286000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807549 closing signal SIGTERM -W0703 01:01:19.286000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807550 closing signal SIGTERM -W0703 01:01:19.288000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807551 closing signal SIGTERM -W0703 01:01:19.289000 139720846501696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1807552 closing signal SIGTERM -W0703 01:01:19.377000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835204 closing signal SIGTERM -W0703 01:01:19.378000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835205 closing signal SIGTERM -W0703 01:01:19.378000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835206 closing signal SIGTERM -W0703 01:01:19.379000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835207 closing signal SIGTERM -W0703 01:01:19.381000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835208 closing signal SIGTERM -W0703 01:01:19.381000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835209 closing signal SIGTERM -W0703 01:01:19.382000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835210 closing signal SIGTERM -W0703 01:01:19.382000 139674419038016 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1835211 closing signal SIGTERM -W0703 01:01:22.869000 139668758304512 
torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1835128_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:23.573000 140053635184384 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_871296_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:23.612000 139939675473664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1034730_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:23.674000 140206617364224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3194703_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:23.676000 139715185768192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1807469_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:23.716000 139978888746752 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-73.ec2.internal_876381_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:26.739000 139984549480256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_876381_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:26.757000 139984549480256 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-73.ec2.internal_876381_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -W0703 01:01:27.873000 139668758304512 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1835128_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 01:01:28.577000 140053635184384 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-165-24.ec2.internal_871296_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:28.616000 139939675473664 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-172-57.ec2.internal_1034730_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:28.678000 140206617364224 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-163-226.ec2.internal_3194703_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:28.680000 139715185768192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1807469_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:30.479000 139945336207168 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1034730_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:30.499000 139945336207168 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-172-57.ec2.internal_1034730_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - 
num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 01:01:30.750000 140212278097728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3194703_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:30.772000 140212278097728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-163-226.ec2.internal_3194703_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = 
rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 01:01:31.110000 139720846501696 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1807469_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:31.125000 139720846501696 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1807469_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = 
rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -W0703 01:01:31.464000 140059295917888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_871296_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -srun: error: ip-26-0-163-226: task 2: Exited with exit code 1 -W0703 01:01:31.479000 140059295917888 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_871296_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 01:01:31.577000 139674419038016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1835128_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 01:01:31.595000 139674419038016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1835128_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -srun: error: ip-26-0-165-24: task 3: Exited with exit code 1 -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-192_1111136.1719968208257886371.pt.trace.json: 0%| | 0.00/36.0G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/config.yaml deleted file mode 100644 index 83de57fb0ee524d3ad19edbd893cc3ed1ce603ae..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 2 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 128 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/log.out deleted file mode 100644 index 125334e0462699df2c4963ee4c30c1b871066cea..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/log.out +++ /dev/null @@ -1,4318 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:20:00 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:20:06.085000 139636848924480 torch/distributed/run.py:757] -W0703 03:20:06.085000 139636848924480 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.085000 139636848924480 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:20:06.085000 139636848924480 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.133000 140234749704000 torch/distributed/run.py:757] -W0703 03:20:06.133000 140234749704000 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.133000 140234749704000 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:20:06.133000 140234749704000 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.251000 140070019192640 torch/distributed/run.py:757] -W0703 03:20:06.251000 140070019192640 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.251000 140070019192640 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:20:06.251000 140070019192640 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.271000 140510439348032 torch/distributed/run.py:757] -W0703 03:20:06.271000 140510439348032 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.271000 140510439348032 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:20:06.271000 140510439348032 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.363000 140345902331712 torch/distributed/run.py:757] -W0703 03:20:06.363000 140345902331712 torch/distributed/run.py:757] ***************************************** -W0703 03:20:06.363000 140345902331712 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:20:06.363000 140345902331712 torch/distributed/run.py:757] *****************************************
-W0703 03:20:06.536000 140570660591424 torch/distributed/run.py:757]
-W0703 03:20:06.536000 140570660591424 torch/distributed/run.py:757] *****************************************
-W0703 03:20:06.536000 140570660591424 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-W0703 03:20:06.536000 140570660591424 torch/distributed/run.py:757] *****************************************
-W0703 03:20:06.540000 139845608257344 torch/distributed/run.py:757]
-W0703 03:20:06.540000 139845608257344 torch/distributed/run.py:757] *****************************************
-W0703 03:20:06.540000 139845608257344 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-W0703 03:20:06.540000 139845608257344 torch/distributed/run.py:757] *****************************************
-W0703 03:20:06.622000 140657039120192 torch/distributed/run.py:757]
-W0703 03:20:06.622000 140657039120192 torch/distributed/run.py:757] *****************************************
-W0703 03:20:06.622000 140657039120192 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-W0703 03:20:06.622000 140657039120192 torch/distributed/run.py:757] *****************************************
-[default0]:07/03/2024 03:20:32 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272)
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config:
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=4,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=128,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=2,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))],
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128')),
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None)
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config:
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272)
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model..
-[default0]:07/03/2024 03:20:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks...
-[default5]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=5|ip-26-0-172-57]: No checkpoint path provided.
-[default0]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=0|ip-26-0-172-57]: No checkpoint path provided.
-[default5]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=13|ip-26-0-172-73]: No checkpoint path provided.
-[default2]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=2|ip-26-0-172-57]: No checkpoint path provided.
-[default1]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=1|ip-26-0-172-57]: No checkpoint path provided.
-[default7]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=7|ip-26-0-172-57]: No checkpoint path provided.
-[default6]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=6|ip-26-0-172-57]: No checkpoint path provided.
-[default4]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=4|ip-26-0-172-57]: No checkpoint path provided.
-[default3]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=3|ip-26-0-172-57]: No checkpoint path provided.
-[default6]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=14|ip-26-0-172-73]: No checkpoint path provided.
-[default7]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=15|ip-26-0-172-73]: No checkpoint path provided.
-[default1]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=9|ip-26-0-172-73]: No checkpoint path provided.
-[default3]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=11|ip-26-0-172-73]: No checkpoint path provided.
-[default0]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=8|ip-26-0-172-73]: No checkpoint path provided.
-[default4]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=12|ip-26-0-172-73]: No checkpoint path provided.
-[default2]:07/03/2024 03:20:50 [INFO|DP=3|PP=0|TP=10|ip-26-0-172-73]: No checkpoint path provided.
-[default0]:07/03/2024 03:20:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2119.44MiB)
-[default0]:07/03/2024 03:20:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB)
-[default0]:07/03/2024 03:20:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB
-[default0]:07/03/2024 03:20:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
-[default0]:07/03/2024 03:20:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator
-[... at 03:20:50-03:20:51 the other 15 DP=0 tensor-parallel ranks (TP=1-7 on ip-26-0-160-192, TP=8-15 on ip-26-0-161-178) log the same "Local number of parameters: 69.4M (132.46MiB)", "[After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB" and "No checkpoint path provided." messages, and the 16 ranks of DP=1 (ip-26-0-163-220, ip-26-0-163-226) and 16 ranks of DP=2 (ip-26-0-168-238, ip-26-0-169-86) each log "No checkpoint path provided." ...]
-[default0]:07/03/2024 03:20:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 03:20:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 03:20:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:20:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:20:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:20:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:20:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 03:20:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library
-[default0]:07/03/2024 03:20:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:07/03/2024 03:20:54 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 03:20:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 03:20:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 03:20:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]:
-[default0]:07/03/2024 03:20:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-03 03:20:56.745354 | mbs: 128 | grad_accum: 2 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 03:20:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 03:20:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB
-[... between 03:20:56 and 03:20:57 every other rank (DP=0-3, TP=0-15, across all eight nodes) logs the same "[WARNING|...]: Repo card metadata block was not found. Setting CardData to empty." message, most of them followed by a second, unprefixed "Repo card metadata block was not found. Setting CardData to empty." line ...]
-[default6]:[rank30]: Traceback (most recent call last): -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank29]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( 
-ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19][default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank19]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank19]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[de[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -fault3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: return row_linear( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.39 GiB is free. Including non-PyTorch memory, this process has 77.93 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -/stable/notes/cuda.html#environment-variables) -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: output = model(**micro_batch) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank26]: output = model(**micro_batch) -[default0]:[rank24]: output = model(**micro_batch) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: output = model(**micro_batch) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default5]:[rank29]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank26]: sharded_logits = self.model( -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: sharded_logits = self.model( -[default0]:[rank24]: sharded_logits = self.model( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) 
-[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank16]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank16]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: Traceback (most recent call last): -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: trainer.train(dataloader) -[default0]:[rank16]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. 
GPU -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module[default5]:[rank29]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: sharded_logits = self.model( -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default2]:[rank18]: 
return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank24]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default5]:[rank29]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank24]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: output = self.pp_block(**new_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank17]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank24]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default1]:[rank17]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank29]: hidden_states = 
self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank18]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank28]: trainer.train(dataloader) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank18]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.39 GiB is free. Including non-PyTorch memory, this process has 77.93 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank25]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank30]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.84 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default1]:[rank25]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank21]: output = model(**micro_batch) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: sharded_logits = self.model( -[default5]:[rank21]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[def[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -ault5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank21]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modul[default4]:[rank28]: output = model(**micro_batch) -es/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default5]:[rank21]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwa[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -rgs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank21]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.39 GiB is free. Including non-PyTorch memory, this process has 77.93 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: return row_linear( -[default1]:[rank25]: return row_linear( -[default5]:[rank29]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank24]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: return row_linear( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: out = F.linear(input, weight, bias) -[default2]:[rank26]: out = F.linear(input, weight, bias) -[default4]:[rank28]: sharded_logits = self.model( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 77.88 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank24]: return row_linear( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: out = F.linear(input, weight, bias) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: output = model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default3]:[rank27]: Traceback (most recent call last): -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 77.88 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank20]: sharded_logits = self.model( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: trainer.train(dataloader) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank20]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank20]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/ten[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -sor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank20]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: out = F.linear(input, weight, bias) -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.84 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: Traceback (most recent call last): -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank22]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank22]: output = model(**micro_batch) -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward 
-[default6]:[rank22]: sharded_logits = self.model( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank22]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank22]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: output = model(**micro_batch) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank22]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank22]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.[default3]:[rank27]: return self._call_impl(*args, **kwargs) -10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return 
forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank22]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank22]: return row_linear( -[defaul[default7]:[rank31]: output = model(**micro_batch) -t6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: out = F.linear(input, weight, bias) -[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.84 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = 
self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanot[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -ron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: sharded_logits = self.model( -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotro[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -n/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank23]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank23]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank28]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.39 GiB is free. Including non-PyTorch memory, this process has 77.93 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return row_linear( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: out = F.linear(input, weight, bias) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: 
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank31]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank27]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] 
-[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank31]: return row_linear( -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 77.88 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank27]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank27]: return row_linear( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 77.88 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
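Every failing rank above ends with the same allocator hint: if reserved-but-unallocated memory is large, PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True may reduce fragmentation. Below is a minimal sketch of applying that hint, assuming the variable is set before the process first touches CUDA; the placement is hypothetical and not part of the benchmark scripts, and with roughly 68 GiB of the 79.33 GiB already allocated by PyTorch here, an expandable-segments allocator is unlikely to recover a 2.00 GiB activation, so this is a mitigation to try rather than a root-cause fix.

    import os

    # Hedged sketch: the CUDA caching allocator reads PYTORCH_CUDA_ALLOC_CONF
    # when it initialises, so the variable must be set before the first CUDA
    # allocation in each rank (or exported in the launch environment).
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch  # imported only after the allocator setting is in place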
-[default0]:[rank56]: Traceback (most recent call last):
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default0]:[rank56]: trainer.train(dataloader)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default0]:[rank56]: output = model(**micro_batch)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default0]:[rank56]: sharded_logits = self.model(
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default0]:[rank56]: output = self.pp_block(**new_kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default0]:[rank56]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default0]:[rank56]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default0]:[rank56]: return self._call_impl(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default0]:[rank56]: return forward_call(*args, **kwargs)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default0]:[rank56]: return row_linear(
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
-[default0]:[rank56]: out = differentiable_reduce_scatter_sum(out, group=group)
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
-[default0]:[rank56]: return DifferentiableReduceScatterSum.apply(tensor, group)
-[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
-[default0]:[rank56]: return super().apply(*args, **kwargs)  # type: ignore[misc]
-[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
-[default0]:[rank56]: sharded_tensor = torch.empty(
-[default0]:[rank56]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU
[ranks 60 and 62 fail with the identical traceback; their final error lines:]
-[default4]:[rank60]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default6]:[rank62]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[ranks 49 and 51 fail on the same path as ranks 22-31 above (functional.py line 474, out = F.linear(input, weight, bias)); ranks 57 and 61 fail on the same path as rank 56 above (functional.py line 479 -> differentiable_reduce_scatter_sum -> torch.empty in distributed_differentiable_primitives.py line 111); their tracebacks, interleaved in the original log, end in the following error lines:]
-[default1]:[rank57]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 101.94 MiB is free. Including non-PyTorch memory, this process has 79.22 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank61]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 101.94 MiB is free. Including non-PyTorch memory, this process has 79.22 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank51]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.43 GiB is free. Including non-PyTorch memory, this process has 77.89 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default1]:[rank49]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.43 GiB is free. Including non-PyTorch memory, this process has 77.89 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default2]:[rank58]: sharded_logits = self.model( -[default3]:[rank59]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: sharded_logits = self.model( -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank59]: output = self.pp_block(**new_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) 
-[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank59]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank58]: output = self.pp_block(**new_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank58]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank59]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl 
-[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank59]: return row_linear( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default3]:[rank59]: out = differentiable_reduce_scatter_sum(out, group=group) -[default2]:[rank58]: return row_linear( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default3]:[rank59]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default2]:[rank58]: out = differentiable_reduce_scatter_sum(out, group=group) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default2]:[rank58]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default3]:[rank59]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default2]:[rank58]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default3]:[rank59]: sharded_tensor = torch.empty( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default3]:[rank59]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 101.94 MiB is free. Including non-PyTorch memory, this process has 79.22 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank58]: sharded_tensor = torch.empty( -[default2]:[rank58]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
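That hint can be tried directly from the job script. A minimal sketch, assuming the variable is exported before the srun/torchrun launch; it only targets fragmentation, which the small reserved-but-unallocated figure above suggests is not the bottleneck here:

# hedged sketch (not part of the original job script): opt in to PyTorch's
# expandable-segments allocator before launching training, as the OOM message
# suggests. With roughly 68-70 GiB of live PyTorch allocations on a 79.33 GiB
# card, this is unlikely to rescue the run, but it rules fragmentation in or out.
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"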
Ranks 53 and 63 are interleaved next and hit the same two allocation sites:

rank 53: 2.00 GiB requested at the F.linear call, 1.43 GiB free, 77.89 GiB in use, 68.20 GiB allocated by PyTorch
rank 63: 128.00 MiB requested at the reduce-scatter's torch.empty, 101.94 MiB free, 79.22 GiB in use, 70.20 GiB allocated by PyTorch
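Because every rank prints a near-identical traceback, the useful signal in output like this is the set of distinct OOM reports. A small sketch, assuming GNU grep and that the combined job output is in a file named log.out (the actual path is whatever the job's --output setting points at):

# one line per distinct "<rank>: OOM, requested size" pair, duplicates collapsed
grep -oE '\[rank[0-9]+\]: torch\.cuda\.OutOfMemoryError: CUDA out of memory\. Tried to allocate [0-9.]+ [MG]iB' \
    log.out | sort -u

# how many failures of each requested size (2.00 GiB vs 128.00 MiB here)
grep -oE 'Tried to allocate [0-9.]+ [MG]iB' log.out | sort | uniq -c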
Ranks 50 and 55 follow, both failing at the F.linear allocation:

rank 50: 2.00 GiB requested, 1.51 GiB free, 77.80 GiB in use, 68.20 GiB allocated by PyTorch
rank 55: 2.00 GiB requested, 1.43 GiB free, 77.89 GiB in use, 68.20 GiB allocated by PyTorch
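The line-by-line interleaving itself comes from every worker sharing one output stream. torchrun can write one log file per rank instead; the invocation below is a hypothetical single-node stand-in for the real multi-node launcher (the script name, config path and --standalone rendezvous are placeholders), shown only for the logging flags:

# --redirects 3 sends both stdout and stderr of every worker to files under
# --log_dir, so each rank's traceback stays in its own file instead of being
# interleaved; --tee 3 additionally keeps a copy on the console.
torchrun --standalone --nproc_per_node 8 \
    --log_dir ./torchrun_logs --redirects 3 --tee 3 \
    run_train.py --config-file config.yaml

With per-rank files in place, a failure like the one above reads as a handful of self-contained tracebacks rather than the merged stream shown here.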
Ranks 48, 52 and 54 close out this group, again at the F.linear allocation; rank 48's report is cut off by the interleaving before the capacity figures:

rank 48: 2.00 GiB requested (rest of the message truncated in the interleaved output)
rank 52: 2.00 GiB requested, 1.51 GiB free, 77.80 GiB in use, 68.20 GiB allocated by PyTorch
rank 54: 2.00 GiB requested, 1.51 GiB free, 77.80 GiB in use, 68.20 GiB allocated by PyTorch
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank14]: output = model(**micro_batch) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank14]: sharded_logits = self.model( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default6]:[rank14]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank14]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank14]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank14]: return row_linear( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank14]: out = F.linear(input, weight, bias) -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.43 GiB is free. Including non-PyTorch memory, this process has 77.89 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank11]: output = model(**micro_batch) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank11]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank11]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank11]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.51 GiB is free. Including non-PyTorch memory, this process has 77.80 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[interleaved tracebacks from rank9, rank13 and rank8, identical to the call stacks above (run_train.py -> trainer.train -> training_step -> train_batch_iter -> model forward -> pp_block -> mlp -> down_proj -> row_linear -> F.linear); rank9 and rank13 each end with: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.51 GiB is free. Including non-PyTorch memory, this process has 77.80 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables); rank8's message is cut off after "Tried to allocate 2.00 GiB. GPU"]
-[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank2]: output = model(**micro_batch) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank2]: sharded_logits = self.model( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank2]: return
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank2]: output = self.pp_block(**new_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank2]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank2]: return row_linear( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default2]:[rank2]: out = differentiable_reduce_scatter_sum(out, group=group) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default2]:[rank2]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default2]:[rank2]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default2]:[rank2]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default2]:[rank2]: sharded_tensor = torch.empty( -[default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 101.94 MiB is free. Including non-PyTorch memory, this process has 79.22 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: output = model(**micro_batch) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: sharded_logits = self.model( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank10]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default2]:[rank10]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default2]:[rank10]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank10]: return row_linear( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank10]: out = F.linear(input, weight, bias) -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.43 GiB is free. Including non-PyTorch memory, this process has 77.89 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward 
-[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank15]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank15]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.51 GiB is free. Including non-PyTorch memory, this process has 77.80 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[interleaved tracebacks from rank4, rank5, rank1 and rank12, identical to the call stacks above, several lines spliced mid-path; rank12's traceback is the only one that completes in this stretch and ends with:]
-[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank12]: return row_linear( -[default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank12]: out = F.linear(input, weight, bias) -[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.43 GiB is free. Including non-PyTorch memory, this process has 77.89 GiB memory in use.
Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank4]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default1]:[rank1]: output = self.pp_block(**new_kwargs) -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default5]:[rank5]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return row_linear( -[default1]:[rank1]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank4]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default4]:[rank4]: out = differentiable_reduce_scatter_sum(out, group=group) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default4]:[rank4]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default1]:[rank1]: return row_linear( -[default4]:[rank4]: sharded_tensor = torch.empty( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 101.94 MiB is free. Including non-PyTorch memory, this process has 79.22 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: return row_linear( -[default1]:[rank1]: out = differentiable_reduce_scatter_sum(out, group=group) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default5]:[rank5]: out = differentiable_reduce_scatter_sum(out, group=group) -[default1]:[rank1]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default1]:[rank1]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default5]:[rank5]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default1]:[rank1]: sharded_tensor = torch.empty( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default5]:[rank5]: sharded_tensor = torch.empty( -[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: output = model(**micro_batch) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank3]: sharded_logits = self.model( -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank3]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank3]: return row_linear( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default3]:[rank3]: out = differentiable_reduce_scatter_sum(out, group=group) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default3]:[rank3]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default3]:[rank3]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default3]:[rank3]: sharded_tensor = torch.empty( -[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: output = model(**micro_batch) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: sharded_logits = self.model( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: Traceback (most recent call last): -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, 
loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: output = self.pp_block(**new_kwargs) -[default0]:[rank0]: output = model(**micro_batch) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank0]: sharded_logits = self.model( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank6]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank6]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: return row_linear( -[default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default6]:[rank6]: out = differentiable_reduce_scatter_sum(out, group=group) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank0]: output = self.pp_block(**new_kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default6]:[rank6]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default6]:[rank6]: sharded_tensor = torch.empty( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank0]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 101.94 MiB is free. Including non-PyTorch memory, this process has 79.22 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank0]: return row_linear( -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default0]:[rank0]: out = differentiable_reduce_scatter_sum(out, group=group) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default0]:[rank0]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default0]:[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default0]:[rank0]: sharded_tensor = torch.empty( -[default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. 
GPU -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: trainer.train(dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank7]: output = model(**micro_batch) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank7]: sharded_logits = self.model( -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank7]: output = self.pp_block(**new_kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return 
self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank7]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank7]: return row_linear( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear -[default7]:[rank7]: out = differentiable_reduce_scatter_sum(out, group=group) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum -[default7]:[rank7]: return DifferentiableReduceScatterSum.apply(tensor, group) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply -[default7]:[rank7]: return super().apply(*args, **kwargs) # type: ignore[misc] -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward -[default7]:[rank7]: sharded_tensor = torch.empty( -[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 70.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -W0703 03:21:13.630000 140570660591424 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 902600 closing signal SIGTERM -W0703 03:21:13.630000 140570660591424 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 902601 closing signal SIGTERM -W0703 03:21:13.630000 140570660591424 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 902602 closing signal SIGTERM -W0703 03:21:13.630000 140570660591424 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 902603 closing signal SIGTERM -W0703 03:21:13.630000 140570660591424 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 902605 closing signal SIGTERM -W0703 03:21:13.630000 140570660591424 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 902606 closing signal SIGTERM -E0703 03:21:15.058000 140570660591424 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 902599) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:21:13 - host : ip-26-0-172-73.ec2.internal - rank : 61 (local_rank: 5) - exitcode : 1 (pid: 902604) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:21:13 - host : ip-26-0-172-73.ec2.internal - rank : 56 (local_rank: 0) - exitcode : 1 (pid: 902599) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-172-73: task 7: Exited with exit code 1 -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:21:18 [WARNING|DP=2|PP=0|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. 
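Every OOM message above ends with the same allocator hint. Below is a minimal sketch of how that hint could be applied, assuming the variable is exported in the job's launch environment before torchrun spawns the training processes; only the variable name and value are taken from the error text, the placement is illustrative:

    # Assumed placement: export before the srun/torchrun launch so every rank inherits it.
    # Lets the CUDA caching allocator use expandable segments instead of fixed-size blocks,
    # which can reduce fragmentation of reserved-but-unallocated memory.
    export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

Note that the traces above report only ~79 MiB reserved but unallocated against 68-70 GiB allocated by PyTorch, so this setting alone is unlikely to recover the missing memory; the failures read as a genuine capacity shortfall for this parallelism configuration rather than fragmentation.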
-[default2]:[rank42]: Traceback (most recent call last):
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default2]:[rank42]: trainer.train(dataloader)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter(
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default2]:[rank42]: output = model(**micro_batch)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default2]:[rank42]: sharded_logits = self.model(
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default2]:[rank42]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default2]:[rank42]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default2]:[rank42]: output = self.pp_block(**new_kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
-[default2]:[rank42]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
-[default2]:[rank42]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default2]:[rank42]: return self._call_impl(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default2]:[rank42]: return forward_call(*args, **kwargs)
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default2]:[rank42]: return row_linear(
-[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default2]:[rank42]: out = F.linear(input, weight, bias)
-[default2]:[rank42]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.39 GiB is free. Including non-PyTorch memory, this process has 77.93 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank45]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.84 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank47]: Traceback (most recent call last): -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: Traceback (most recent call last): -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank44]: trainer.train(dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank47]: trainer.train(dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank44]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank44]: sharded_logits = self.model( -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: output = model(**micro_batch) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank44]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank44]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank44]: output = self.pp_block(**new_kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank47]: output = self.pp_block(**new_kwargs) -[default4]:[rank44]: 
File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank44]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank44]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank44]: return row_linear( -[default7]:[rank47]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: out = F.linear(input, weight, bias) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.39 GiB is free. Including non-PyTorch memory, this process has 77.93 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank47]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank47]: return row_linear( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank47]: out = F.linear(input, weight, bias) -[default7]:[rank47]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.84 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank46]: Traceback (most recent call last): -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank43]: Traceback (most recent call last): -[default1]:[rank41]: Traceback (most recent call last): -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank41]: trainer.train(dataloader) -[default6]:[rank46]: trainer.train(dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank46]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank43]: trainer.train(dataloader) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, 
in train_batch_iter -[default3]:[rank43]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: output = model(**micro_batch) -[default1]:[rank41]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: output = model(**micro_batch) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: sharded_logits = self.model( -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: sharded_logits = self.model( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: output = self.pp_block(**new_kwargs) -[default3]:[rank43]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank41]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank41]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank43]: output = self.pp_block(**new_kwargs) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: output = self.pp_block(**new_kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default1]:[rank41]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default1]:[rank41]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default3]:[rank43]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank41]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank43]: return self._call_impl(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return row_linear( -[default3]:[rank43]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank41]: return self._call_impl(*args, **kwargs) -[default3]:[rank43]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank43]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank46]: out = F.linear(input, weight, bias) -[default3]:[rank43]: return row_linear( -[default6]:[rank46]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.39 GiB is free. Including non-PyTorch memory, this process has 77.93 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank43]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank41]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank43]: out = F.linear(input, weight, bias) -[default1]:[rank41]: return forward_call(*args, **kwargs) -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank43]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.84 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank41]: return row_linear( -[default1]:[rank41]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank41]: out = F.linear(input, weight, bias) -[default1]:[rank41]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.47 GiB is free. Including non-PyTorch memory, this process has 77.84 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank40]: Traceback (most recent call last): -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank40]: trainer.train(dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank40]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank40]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank40]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank40]: output = model(**micro_batch) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank40]: sharded_logits = self.model( -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank40]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank40]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank40]: output = self.pp_block(**new_kwargs) -[default0]:[rank40]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default0]:[rank40]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default0]:[rank40]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank40]: return self._call_impl(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank40]: return forward_call(*args, **kwargs) -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank40]: return row_linear( -[default0]:[rank40]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank40]: out = F.linear(input, weight, bias) -[default0]:[rank40]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. 
[Ranks 32, 33, 34, 35, 38 and 39 then begin the same traceback (run_train.py line 237, trainer.train, training_step, train_batch_iter, the llama.py forward chain, row_linear, F.linear), again interleaved line-by-line. Rank 32 already reaches the same torch.cuda.OutOfMemoryError for a 2.00 GiB allocation, its message likewise cut off after "GPU" in the raw log, and rank 33 reaches out = F.linear(input, weight, bias); the remaining interleaved frames continue in the raw log below.]
-[default2]:[rank34]: return
self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: output = model(**micro_batch) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank33]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: return row_linear( -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default3]:[rank35]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default2]:[rank34]: out = F.linear(input, weight, bias) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 77.88 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank35]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default6]:[rank38]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank35]: return row_linear( -[default6]:[rank38]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank39]: sharded_logits = self.model( -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank35]: out = F.linear(input, weight, bias) -[default3]:[rank35]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank39]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank39]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank39]: output = self.pp_block(**new_kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default7]:[rank39]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default7]:[rank39]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default7]:[rank39]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank39]: return self._call_impl(*args, **kwargs) -[default7]:[rank39]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank39]: return forward_call(*args, **kwargs) -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank39]: return row_linear( -[default7]:[rank39]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank39]: out = F.linear(input, weight, bias) -[default7]:[rank39]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank38]: return row_linear( -[default6]:[rank38]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank38]: out = F.linear(input, weight, bias) -[default6]:[rank38]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 77.88 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank36]: Traceback (most recent call last): -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank36]: trainer.train(dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: Traceback (most recent call last): -[default4]:[rank36]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank36]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank36]: output = model(**micro_batch) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank36]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank37]: trainer.train(dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default5]:[rank37]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank37]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank36]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank37]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank37]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank37]: output = model(**micro_batch) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank37]: sharded_logits = self.model( -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank37]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank37]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank36]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank37]: output = self.pp_block(**new_kwargs) -[default4]:[rank36]: output = self.pp_block(**new_kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, 
**kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank37]: return row_linear( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank37]: out = F.linear(input, weight, bias) -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default5]:[rank37]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.27 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward -[default4]:[rank36]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward -[default4]:[rank36]: hidden_states = self.down_proj(self.split_silu_mul(merged_states)) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank36]: return self._call_impl(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank36]: return forward_call(*args, **kwargs) -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank36]: return row_linear( -[default4]:[rank36]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank36]: out = F.linear(input, weight, bias) -[default4]:[rank36]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.44 GiB is free. Including non-PyTorch memory, this process has 77.88 GiB memory in use. Of the allocated memory 68.20 GiB is allocated by PyTorch, and 79.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -W0703 03:21:23.645000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 523703 closing signal SIGTERM -W0703 03:21:23.645000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 523705 closing signal SIGTERM -W0703 03:21:23.645000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 523706 closing signal SIGTERM -W0703 03:21:23.645000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 523707 closing signal SIGTERM -W0703 03:21:23.646000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 523708 closing signal SIGTERM -W0703 03:21:23.646000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 523709 closing signal SIGTERM -W0703 03:21:23.646000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 523710 closing signal SIGTERM -E0703 03:21:23.772000 139636848924480 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 773369) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 03:21:23.774000 140070019192640 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3219489) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 03:21:23.775000 140345902331712 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1137145) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -E0703 03:21:23.779000 140510439348032 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1060585) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return 
launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 773370) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 773371) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 773372) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 773373) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 773374) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 773375) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 773376) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-220.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 773369) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1137146) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 
1137147) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1137148) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1137149) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1137150) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1137151) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1137152) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:21:23 - host : ip-26-0-160-192.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1137145) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 1060586) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 1060587) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 1060588) - error_file: - traceback : To enable traceback see: 
https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 1060589) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 1060590) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 1060591) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 1060592) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:21:23 - host : ip-26-0-172-57.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 1060585) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 3219490) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 3219491) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 3219492) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 3219493) - error_file: - 
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 3219494) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 3219495) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 3219496) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:21:23 - host : ip-26-0-163-226.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 3219489) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-163-220: task 2: Exited with exit code 1 -srun: error: ip-26-0-163-226: task 3: Exited with exit code 1 -srun: error: ip-26-0-172-57: task 6: Exited with exit code 1 -srun: error: ip-26-0-160-192: task 0: Exited with exit code 1 -E0703 03:21:25.571000 140657039120192 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 523704) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -W0703 03:21:25.579000 140657039120192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_523630_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:25.606000 140657039120192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_523630_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:25.614000 140657039120192 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-178.ec2.internal_523630_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:21:23 - host : ip-26-0-161-178.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 523704) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-178: task 1: Exited with exit code 1 -W0703 03:21:27.930000 139839947523840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1834089_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:28.286000 140229088970496 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1860288_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 03:21:28.653000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834167 closing signal SIGTERM -W0703 03:21:28.653000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834168 closing signal SIGTERM -W0703 03:21:28.653000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834169 closing signal SIGTERM -W0703 03:21:28.654000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834170 closing signal SIGTERM -W0703 03:21:28.656000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834171 closing signal SIGTERM -W0703 03:21:28.656000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834172 closing signal SIGTERM -W0703 03:21:28.657000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834173 closing signal SIGTERM -W0703 03:21:28.658000 139845608257344 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1834174 closing signal SIGTERM -W0703 03:21:28.667000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860368 closing signal SIGTERM -W0703 03:21:28.668000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860369 closing signal SIGTERM -W0703 03:21:28.668000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860370 closing signal SIGTERM -W0703 03:21:28.670000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860371 closing signal SIGTERM -W0703 03:21:28.670000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860372 closing signal SIGTERM -W0703 03:21:28.670000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860373 closing signal SIGTERM -W0703 03:21:28.670000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860374 closing signal SIGTERM -W0703 03:21:28.671000 140234749704000 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1860375 closing signal SIGTERM -W0703 03:21:32.934000 139839947523840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1834089_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:33.290000 140229088970496 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-168-238.ec2.internal_1860288_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:37.938000 139839947523840 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-86.ec2.internal_1834089_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:38.116000 140234749704000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1860288_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:38.130000 140234749704000 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-168-238.ec2.internal_1860288_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 03:21:38.203000 139845608257344 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1834089_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 03:21:38.214000 139845608257344 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-86.ec2.internal_1834089_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-168-238: task 4: Exited with exit code 1 -srun: error: ip-26-0-169-86: task 5: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
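The failure above follows a recognizable pattern: the torchrun agents first send SIGTERM to their worker processes, then their keep-alive heartbeats to the c10d rendezvous fail with RendezvousConnectionError (the underlying store call dies with a broken pipe, typically because the host serving the store has already gone away), and finally srun reports the surviving tasks as exited with code 1. The bench.slurm scripts in this dump classify failed runs by grepping log.out for "OutOfMemoryError", "CUDA error: an illegal memory access", and "Timeout at NCCL", so a run that dies this way falls through to the generic "fail" status. The shell sketch below is illustrative only and not part of the original scripts; it assumes a log file named log.out in the current directory and reuses the same grep-based approach while also recognizing the rendezvous signature seen here.

LOG=log.out   # hypothetical relative path; the original scripts grep an absolute results path
if grep -q "OutOfMemoryError" "$LOG"; then
    printf "oom"
elif grep -q "RendezvousConnectionError" "$LOG"; then
    # rendezvous/store lost mid-run; a separate label is an assumption, the original scripts have no such case
    printf "rendezvous"
elif grep -q "Timeout at NCCL" "$LOG"; then
    printf "timeout"
else
    printf "fail"
fi

In the original scripts the chosen label is written to the run's status.txt next to the log rather than to stdout.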
diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-128/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/bench.slurm b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/bench.slurm deleted file mode 100644 index 34249f15ef18e5ff437884c17f746a4a69d7bbef..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/config.yaml deleted file mode 100644 index 5ebd37150e71bd9fdb4f151813dd3c8ac3b8d13f..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 128 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 2 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out deleted file mode 100644 index f4f2eb5ddf2888da70fb92ab5267898ee52223e2..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2/log.out +++ /dev/null @@ -1,1736 +0,0 @@ -======================== -START TIME: Wed Jul 3 04:09:49 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 04:09:51.644000 139953438979904 torch/distributed/run.py:757] -W0703 04:09:51.644000 139953438979904 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.644000 139953438979904 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:09:51.644000 139953438979904 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.669000 139977578473280 torch/distributed/run.py:757] -W0703 04:09:51.669000 139977578473280 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.669000 139977578473280 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:09:51.669000 139977578473280 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.692000 140281171007296 torch/distributed/run.py:757] -W0703 04:09:51.692000 140281171007296 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.692000 140281171007296 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:09:51.692000 140281171007296 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.716000 139735782631232 torch/distributed/run.py:757] -W0703 04:09:51.716000 139735782631232 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.716000 139735782631232 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:09:51.716000 139735782631232 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.717000 139761185748800 torch/distributed/run.py:757] -W0703 04:09:51.717000 139761185748800 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.717000 139761185748800 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 04:09:51.717000 139761185748800 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.720000 140131790595904 torch/distributed/run.py:757] -W0703 04:09:51.720000 140131790595904 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.720000 140131790595904 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:09:51.720000 140131790595904 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.727000 140508885550912 torch/distributed/run.py:757] -W0703 04:09:51.727000 140508885550912 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.727000 140508885550912 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:09:51.727000 140508885550912 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.779000 140638446999360 torch/distributed/run.py:757] -W0703 04:09:51.779000 140638446999360 torch/distributed/run.py:757] ***************************************** -W0703 04:09:51.779000 140638446999360 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 04:09:51.779000 140638446999360 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 04:10:12 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=4, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 04:10:12 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=2, -[default0]:07/03/2024 04:10:12 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=128, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-2')), -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 04:10:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default3]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.11G (2119.44MiB) -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default7]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. 
Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default2]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-138]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-138]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-138]: No checkpoint path provided. -[default6]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. -[default5]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default6]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=15|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 04:10:29 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. 
-[default2]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default7]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=1|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=0|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=6|ip-26-0-161-78]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=5|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=7|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=15|ip-26-0-171-102]: No checkpoint path provided. 
-[default6]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 04:10:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=3|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=4|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=2|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 04:10:29 [INFO|DP=2|PP=0|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 04:10:29 [INFO|DP=3|PP=0|TP=13|ip-26-0-171-88]: No checkpoint path provided. 
-[default0]:07/03/2024 04:10:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 04:10:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 04:10:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 04:10:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 04:10:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 04:10:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 04:10:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 04:10:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 04:10:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 04:10:34 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:10:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 04:10:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 04:10:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 04:10:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 04:10:37.493207 | mbs: 2 | grad_accum: 128 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 04:10:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 04:10:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB -[default7]:07/03/2024 04:10:37 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:10:37 [WARNING|DP=3|PP=0|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:10:37 [WARNING|DP=3|PP=0|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:10:37 [WARNING|DP=3|PP=0|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:10:37 [WARNING|DP=2|PP=0|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/03/2024 04:10:37 [WARNING|DP=1|PP=0|TP=10|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=13|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=5|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=11|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=2|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=8|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=9|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=6|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=7|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=4|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=12|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=6|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=5|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=3|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:10:38 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=15|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=14|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 04:10:38 [WARNING|DP=3|PP=0|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 04:10:38 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 04:10:38 [WARNING|DP=2|PP=0|TP=0|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 04:10:39 [WARNING|DP=2|PP=0|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 04:10:39 [WARNING|DP=2|PP=0|TP=1|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. 
-[default4]:07/03/2024 04:10:39 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty.
[... the "Repo card metadata" warning above is logged between 04:10:39 and 04:10:54 by ranks on ip-26-0-160-225, ip-26-0-161-103, ip-26-0-161-78, ip-26-0-171-62 and ip-26-0-171-102 (DP=0, 2 and 3; PP=0; various TP ranks); each logger line is followed by an identical bare stderr copy. The near-identical repeats are omitted here ...]
-[default7]:07/03/2024 04:10:48 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-138]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default7]:07/03/2024 04:10:48 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-138]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
[... the two cache-fallback lines above are logged between 04:10:48 and 04:10:50 by ranks from all four DP groups on ip-26-0-161-103, ip-26-0-161-138, ip-26-0-161-153, ip-26-0-161-78, ip-26-0-171-88 and ip-26-0-171-102, each again followed by a bare stderr copy; the repeats are omitted here ...]
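A minimal sketch, not taken from the benchmark repo, of how the cache fallback shown above could be made explicit so the run never depends on Hub availability. The dataset name and cache directory are copied from the warnings; the offline flag and everything else are assumptions about how one might pin the behavior.

    # Hedged sketch: rely only on the local datasets cache (assumption, not the repo's code).
    import os

    # Must be set before `datasets` is imported; with it the library never contacts the Hub
    # and fails fast if the dataset was not cached by an earlier run.
    os.environ["HF_DATASETS_OFFLINE"] = "1"

    from datasets import load_dataset

    ds = load_dataset(
        "roneneldan/TinyStories",                      # dataset named in the warnings above
        cache_dir="/admin/home/ferdinand_mom/.cache",  # cache path shown in the warnings
    )
    print(ds)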
-[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
[... the UserWarning and traceback line above are emitted during the first backward pass by local ranks default0-default7 on every node; the dozens of near-identical copies (differing only in the [defaultN] prefix, some interleaved mid-line in the original log) are omitted here. Two further TinyStories cache-fallback warnings are interleaved in this stretch, from DP=0|PP=0|TP=4 at 04:11:00 and DP=0|PP=0|TP=2 at 04:11:05, both on ip-26-0-160-225 ...]
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default5]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default4]: warnings.warn( -[default3]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:07/03/2024 04:11:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 567.08MiB. Peak allocated 3184.70MiB. Peak reserved: 3482.00MiB -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:07/03/2024 04:11:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 56.5K | tokens_per_sec: 74.3K | tokens_per_sec_per_gpu: 1.16K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 0.0001 | model_tflops_per_gpu: 10.5 | hardware_tflops_per_gpu: 10.5 | grad_norm: 11.5 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 3.74G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 04:11:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.61MiB. Peak allocated 997.66MiB. Peak reserved: 3564.00MiB -[default0]:07/03/2024 04:11:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.68MiB. Peak allocated 3284.33MiB. 
Peak reserved: 3602.00MiB -[default0]:07/03/2024 04:12:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 30.8K | tokens_per_sec: 136K | tokens_per_sec_per_gpu: 2.13K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.53e-05 | model_tflops_per_gpu: 19.3 | hardware_tflops_per_gpu: 19.3 | grad_norm: 11.6 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 3.78G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 04:12:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.61MiB. Peak allocated 997.72MiB. Peak reserved: 3602.00MiB -[default0]:07/03/2024 04:12:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.68MiB. Peak allocated 3284.33MiB. Peak reserved: 3602.00MiB -[default0]:07/03/2024 04:12:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 30.7K | tokens_per_sec: 137K | tokens_per_sec_per_gpu: 2.14K | global_batch_size: 1.02K | lm_loss: 11.9 | lr: 9.05e-05 | model_tflops_per_gpu: 19.4 | hardware_tflops_per_gpu: 19.4 | grad_norm: 122 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 3.78G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 04:12:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.61MiB. Peak allocated 997.72MiB. Peak reserved: 3602.00MiB -[default0]:STAGE:2024-07-03 04:12:35 25415:25415 ActivityProfilerController.cpp:314] Completed Stage: Warm Up -[default0]:07/03/2024 04:13:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.68MiB. Peak allocated 3284.33MiB. Peak reserved: 3602.00MiB -[default0]:07/03/2024 04:13:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 31.1K | tokens_per_sec: 135K | tokens_per_sec_per_gpu: 2.11K | global_batch_size: 1.02K | lm_loss: 12.3 | lr: 8.58e-05 | model_tflops_per_gpu: 19.1 | hardware_tflops_per_gpu: 19.1 | grad_norm: 18.3 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 3.78G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.5G | hd_free_memory_tb: 246G -[default0]:07/03/2024 04:13:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.61MiB. Peak allocated 997.72MiB. Peak reserved: 3602.00MiB -[default0]:07/03/2024 04:13:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 31.2K | tokens_per_sec: 135K | tokens_per_sec_per_gpu: 2.1K | global_batch_size: 1.02K | lm_loss: 11.2 | lr: 8.11e-05 | model_tflops_per_gpu: 19.1 | hardware_tflops_per_gpu: 19.1 | grad_norm: 29 -[default0]:07/03/2024 04:13:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.61MiB. Peak allocated 3284.33MiB. 
Peak reserved: 3602.00MiB -[default0]:07/03/2024 04:14:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 30.9K | tokens_per_sec: 136K | tokens_per_sec_per_gpu: 2.12K | global_batch_size: 1.02K | lm_loss: 10.2 | lr: 7.63e-05 | model_tflops_per_gpu: 19.2 | hardware_tflops_per_gpu: 19.2 | grad_norm: 10.5 -[default0]:STAGE:2024-07-03 04:15:20 25415:25415 ActivityProfilerController.cpp:320] Completed Stage: Collection -[default0]:STAGE:2024-07-03 04:15:27 25415:25415 ActivityProfilerController.cpp:324] Completed Stage: Post Processing -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:563] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:563] [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:563] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:563] [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:563] [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:563] [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:563] [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:563] [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:563] [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
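Timeout(ms)=600000 in these watchdog entries is the per-collective timeout in effect for the process group: any collective still incomplete after 10 minutes causes ProcessGroupNCCL to abort the process. The timeout is an ordinary torch.distributed setting; the sketch below is generic usage, not this benchmark's launch code.

from datetime import timedelta
import torch.distributed as dist

# Raise the per-collective timeout enforced by the NCCL watchdog.
# 600000 ms (10 minutes) is the value in effect in the run logged here.
dist.init_process_group(backend="nccl", timeout=timedelta(minutes=30))

A longer timeout only helps when a collective is legitimately slow. In this log the groups are stuck on different operations (REDUCE_SCATTER at SeqNum=152896 on one set of ranks, ALLREDUCE at SeqNum=165 on others), which is more consistent with the ranks having desynchronized than with a healthy but slow collective.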
-[default5]:[rank5]:[E ProcessGroupNCCL.cpp:563] [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:563] [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600027 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600029 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. 
-[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600036 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600035 milliseconds before timing out. -[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600049 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600057 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178368, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default0]:[rank40]:[E ProcessGroupNCCL.cpp:563] [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178368, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default4]:[rank44]:[E ProcessGroupNCCL.cpp:563] [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600041 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600040 milliseconds before timing out. -[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178368, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600016 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600037 milliseconds before timing out. 
-[default3]:[rank43]:[E ProcessGroupNCCL.cpp:563] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600019 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600043 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600022 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600024 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600034 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600025 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178368, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600015 milliseconds before timing out. -[default2]:[rank42]:[E ProcessGroupNCCL.cpp:563] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600026 milliseconds before timing out. -[default1]:[rank41]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178347, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 601981 milliseconds before timing out. -[default5]:[rank21]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178317, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602725 milliseconds before timing out. 
-[default1]:[rank25]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178302, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602886 milliseconds before timing out. -[default3]:[rank27]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178303, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602863 milliseconds before timing out. -[default1]:[rank17]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178301, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602858 milliseconds before timing out. -[default5]:[rank37]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178365, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602650 milliseconds before timing out. -[default2]:[rank34]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178330, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602674 milliseconds before timing out. -[default2]:[rank26]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178300, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602897 milliseconds before timing out. -[default2]:[rank18]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178316, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602987 milliseconds before timing out. -[default0]:[rank16]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178311, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602980 milliseconds before timing out. -[default0]:[rank32]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178327, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602780 milliseconds before timing out. -[default0]:[rank24]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178343, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603014 milliseconds before timing out. -[default7]:[rank31]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178301, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603051 milliseconds before timing out. -[default6]:[rank30]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178323, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603009 milliseconds before timing out. -[default4]:[rank28]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178348, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602977 milliseconds before timing out. 
-[default7]:[rank23]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178317, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603072 milliseconds before timing out. -[default6]:[rank22]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178298, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 603047 milliseconds before timing out. -[default3]:[rank19]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178304, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 603046 milliseconds before timing out. -[default6]:[rank46]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178364, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602770 milliseconds before timing out. -[default1]:[rank33]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178367, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602818 milliseconds before timing out. -[default6]:[rank38]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178346, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602861 milliseconds before timing out. -[default7]:[rank47]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178349, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602898 milliseconds before timing out. -[default5]:[rank45]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178351, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602889 milliseconds before timing out. -[default3]:[rank35]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178261, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 603033 milliseconds before timing out. -[default5]:[rank29]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178311, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603205 milliseconds before timing out. -[default4]:[rank20]:[E ProcessGroupNCCL.cpp:563] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178323, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603190 milliseconds before timing out. -[default4]:[rank36]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178361, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602968 milliseconds before timing out. -[default7]:[rank39]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178365, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603056 milliseconds before timing out. 
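The NumelIn/NumelOut fields identify what was hanging. The _REDUCE_SCATTER_BASE entries have NumelIn=16777216 and NumelOut=1048576, a 16:1 ratio, so 16 ranks participate and each receives a 1/16 shard; the _ALLGATHER_BASE entries are the mirror image (1048576 in, 16777216 out). The ALLREDUCE entries reduce a full 69449728-element bucket. A quick check of those numbers:

# Reading the watchdog entries: the element counts give away the group size and bucket size.
reduce_scatter_in, reduce_scatter_out = 16_777_216, 1_048_576
group_size = reduce_scatter_in // reduce_scatter_out    # 16 ranks take part in the reduce-scatter
allreduce_numel = 69_449_728                            # elements in the hung ALLREDUCE bucket
bucket_mib = allreduce_numel * 4 / 2**20                # ~265 MiB, assuming an fp32 bucket
print(group_size, round(bucket_mib, 1))

The 16-way sharding is consistent with a tensor-parallel group of 16 ranks; the fp32 assumption in the size estimate is just that, an assumption.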
-[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178368, OpType=COALESCED, NumelIn=2048, NumelOut=2048, Timeout(ms)=600000) ran for 600021 milliseconds before timing out. -[default2]:[rank50]:[E ProcessGroupNCCL.cpp:563] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600032 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600033 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600030 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600028 milliseconds before timing out. -[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600031 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600020 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600051 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600045 milliseconds before timing out. 
-[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600023 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600056 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=165, OpType=ALLREDUCE, NumelIn=69449728, NumelOut=69449728, Timeout(ms)=600000) ran for 600042 milliseconds before timing out. -[default1]:[rank49]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178342, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 601927 milliseconds before timing out. -[default7]:[rank55]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178354, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602000 milliseconds before timing out. -[default2]:[rank58]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178334, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602168 milliseconds before timing out. -[default5]:[rank53]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178339, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602313 milliseconds before timing out. -[default7]:[rank63]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178355, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602386 milliseconds before timing out. -[default0]:[rank48]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178342, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602434 milliseconds before timing out. -[default6]:[rank54]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178351, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602453 milliseconds before timing out. -[default4]:[rank60]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178354, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602565 milliseconds before timing out. -[default3]:[rank51]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178344, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 602640 milliseconds before timing out. -[default3]:[rank59]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178355, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602643 milliseconds before timing out. 
-[default6]:[rank62]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178339, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602668 milliseconds before timing out. -[default0]:[rank56]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178339, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602784 milliseconds before timing out. -[default5]:[rank61]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178359, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 602799 milliseconds before timing out. -[default4]:[rank52]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178353, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 603144 milliseconds before timing out. -[default1]:[rank57]:[E ProcessGroupNCCL.cpp:563] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=178343, OpType=_ALLGATHER_BASE, NumelIn=1048576, NumelOut=16777216, Timeout(ms)=600000) ran for 604133 milliseconds before timing out. -[default0]:07/03/2024 04:25:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 699.61MiB. Peak allocated 3284.33MiB. Peak reserved: 3602.00MiB -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 13] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:577] [Rank 13] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:583] [Rank 13] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 11] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:577] [Rank 11] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:583] [Rank 11] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank11]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f414c757897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f414da30c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:[rank13]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f650df19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f414da35a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f414da36dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f41994cfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f419e516609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f419e2e1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f650f1f2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f650f1f7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f650f1f8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7f655ac91e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f655fcd8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f655faa3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 13] Process group watchdog thread terminated with exception: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600069 milliseconds before timing out. 
-[default3]: what(): [PG 2 Rank 11] Process group watchdog thread terminated with exception: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f414c757897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f414da30c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f414da35a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f414da36dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f41994cfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f419e516609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f419e2e1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f650df19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f414c757897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f414d6ba119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f41994cfe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f650f1f2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f650f1f7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f650f1f8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: + 0x8609 (0x7f419e516609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f419e2e1353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]:frame #4: + 0xd3e95 (0x7f655ac91e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7f655fcd8609 in 
/lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7f655faa3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f650df19897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7f650ee7c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]: -[default5]:frame #2: + 0xd3e95 (0x7f655ac91e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7f655fcd8609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7f655faa3353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 10] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:577] [Rank 10] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:583] [Rank 10] To avoid data inconsistency, we are taking the entire process down. -[default2]:[rank10]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 10] Process group watchdog thread terminated with exception: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f249fdb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f24a108bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f24a1090a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f24a1091dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f24ecb2ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f24f1b71609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f24f193c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]: what(): [PG 2 Rank 10] Process group watchdog thread terminated with exception: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f249fdb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f24a108bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f24a1090a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f24a1091dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f24ecb2ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f24f1b71609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f24f193c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f249fdb2897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f24a0d15119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f24ecb2ae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f24f1b71609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f24f193c353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 14] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:577] [Rank 14] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:583] [Rank 14] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank14]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb043a32897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb044d0bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb044d10a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb044d11dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb0907aae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb0957f1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb0955bc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 14] Process group watchdog thread terminated with exception: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb043a32897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fb044d0bc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fb044d10a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fb044d11dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7fb0907aae95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7fb0957f1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7fb0955bc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fb043a32897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7fb044995119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7fb0907aae95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7fb0957f1609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7fb0955bc353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 15] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:577] [Rank 15] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:583] [Rank 15] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank15]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 15] Process group watchdog thread terminated with exception: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f68a4544897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f68a581dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f68a5822a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f68a5823dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f68f12bce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f68f6303609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f68f60ce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 15] Process group watchdog thread terminated with exception: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600065 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f68a4544897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f68a581dc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f68a5822a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f68a5823dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f68f12bce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f68f6303609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f68f60ce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f68a4544897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f68a54a7119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f68f12bce95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f68f6303609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f68f60ce353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 8] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:577] [Rank 8] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:583] [Rank 8] To avoid data inconsistency, we are taking the entire process down. -[default0]:[rank8]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 8] Process group watchdog thread terminated with exception: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd97bb89897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd97ce62c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd97ce67a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd97ce68dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd9c8901e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9cd948609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd9cd713353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 2 Rank 8] Process group watchdog thread terminated with exception: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600067 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd97bb89897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fd97ce62c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fd97ce67a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fd97ce68dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7fd9c8901e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7fd9cd948609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7fd9cd713353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fd97bb89897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7fd97caec119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7fd9c8901e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7fd9cd948609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7fd9cd713353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 9] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:577] [Rank 9] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:583] [Rank 9] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank9]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 9] Process group watchdog thread terminated with exception: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd54361897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdd5563ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdd5563fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdd55640dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fdda10d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fdda6120609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fdda5eeb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 9] Process group watchdog thread terminated with exception: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600062 milliseconds before timing out. 
-[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd54361897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fdd5563ac62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fdd5563fa80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fdd55640dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7fdda10d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7fdda6120609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7fdda5eeb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fdd54361897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7fdd552c4119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7fdda10d9e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7fdda6120609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7fdda5eeb353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 12] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:577] [Rank 12] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:583] [Rank 12] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank12]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 12] Process group watchdog thread terminated with exception: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe9a93e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe9aa6c0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe9aa6c5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe9aa6c6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe9f615fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe9fb1a6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe9faf71353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: what(): [PG 2 Rank 12] Process group watchdog thread terminated with exception: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600050 milliseconds before timing out. -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe9a93e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe9aa6c0c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe9aa6c5a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe9aa6c6dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fe9f615fe95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fe9fb1a6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fe9faf71353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe9a93e7897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7fe9aa34a119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fe9f615fe95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #3: + 0x8609 (0x7fe9fb1a6609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #4: clone + 0x43 (0x7fe9faf71353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -W0703 04:27:54.205000 140638446999360 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900057 closing signal SIGTERM -W0703 04:27:54.205000 140638446999360 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900060 closing signal SIGTERM -W0703 04:27:54.205000 140638446999360 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900061 closing signal SIGTERM -W0703 04:27:54.205000 140638446999360 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900062 closing signal SIGTERM -W0703 04:27:54.205000 140638446999360 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900063 closing signal SIGTERM -W0703 04:27:54.206000 140638446999360 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 900064 closing signal SIGTERM -E0703 04:27:55.733000 140638446999360 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 1 (pid: 900058) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_04:27:54 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : -6 (pid: 900059) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 900059 ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_04:27:54 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : -6 (pid: 900058) - error_file: - traceback : Signal 6 (SIGABRT) received by PID 900058 -============================================================ -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 1] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:577] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:583] [Rank 1] To avoid data inconsistency, we are taking the entire process down. -[default1]:[rank1]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f01430d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f01443b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f01443b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f01443b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f018fe51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f0194e98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f0194c63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 7] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default1]: -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:577] [Rank 7] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:583] [Rank 7] To avoid data inconsistency, we are taking the entire process down. -[default7]:[rank7]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. 
-[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b76d05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7b77fdec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7b77fe3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7b77fe4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7bc3a7de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7bc8ac4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7bc888f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:terminate called after throwing an instance of 'c10::DistBackendError' -[default7]: what(): [PG 2 Rank 7] Process group watchdog thread terminated with exception: [Rank 7] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default7]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b76d05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f7b77fdec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f7b77fe3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f7b77fe4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #4: + 0xd3e95 (0x7f7bc3a7de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #5: + 0x8609 (0x7f7bc8ac4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #6: clone + 0x43 (0x7f7bc888f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default7]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default7]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7b76d05897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default7]:frame #1: + 0xe32119 (0x7f7b77c68119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default7]:frame #2: + 0xd3e95 (0x7f7bc3a7de95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default7]:frame #3: + 0x8609 (0x7f7bc8ac4609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default7]:frame #4: clone + 0x43 (0x7f7bc888f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default7]: -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 2] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:577] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:583] [Rank 2] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 4] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:577] [Rank 4] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default2]:[rank2]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:583] [Rank 4] To avoid data inconsistency, we are taking the entire process down. -[default4]:[rank4]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 4] Process group watchdog thread terminated with exception: [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c38186897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faac9ed5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c3945fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faacb1aec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faacb1b3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faacb1b4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fab16c4de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fab1bc94609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c39464a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #6: clone + 0x43 (0x7fab1ba5f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c39465dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c84efee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:terminate called after throwing an instance of 'c10::DistBackendError' -[default2]:frame #5: + 0x8609 (0x7f3c89f45609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]: what(): [PG 2 Rank 4] Process group watchdog thread terminated with exception: [Rank 4] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600072 milliseconds before timing out. 
-[default4]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faac9ed5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7faacb1aec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7faacb1b3a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7faacb1b4dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #4: + 0xd3e95 (0x7fab16c4de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default4]:frame #5: + 0x8609 (0x7fab1bc94609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default4]:frame #6: clone + 0x43 (0x7fab1ba5f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]: -[default4]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default4]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7faac9ed5897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default4]:frame #1: + 0xe32119 (0x7faacae38119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default4]:frame #2: + 0xd3e95 (0x7fab16c4de95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #6: clone + 0x43 (0x7f3c89d10353 in /lib/x86_64-linux-gnu/libc.so.6) -[default4]:frame #3: + 0x8609 (0x7fab1bc94609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]: -[default4]:frame #4: clone + 0x43 (0x7fab1ba5f353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]:terminate called after throwing an instance of 'c10::DistBackendError' -[default4]: -[default2]: what(): [PG 2 Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600053 milliseconds before timing out. 
-[default2]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c38186897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f3c3945fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f3c39464a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f3c39465dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #4: + 0xd3e95 (0x7f3c84efee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #5: + 0x8609 (0x7f3c89f45609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #6: clone + 0x43 (0x7f3c89d10353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default2]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default2]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f3c38186897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default2]:frame #1: + 0xe32119 (0x7f3c390e9119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default2]:frame #2: + 0xd3e95 (0x7f3c84efee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default2]:frame #3: + 0x8609 (0x7f3c89f45609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default2]:frame #4: clone + 0x43 (0x7f3c89d10353 in /lib/x86_64-linux-gnu/libc.so.6) -[default2]: -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 5] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:577] [Rank 5] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:583] [Rank 5] To avoid data inconsistency, we are taking the entire process down. -[default5]:[rank5]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. 
-[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe618d6b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe61a044c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe61a049a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe61a04adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe665ae3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe66ab2a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe66a8f5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:terminate called after throwing an instance of 'c10::DistBackendError' -[default5]: what(): [PG 2 Rank 5] Process group watchdog thread terminated with exception: [Rank 5] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600099 milliseconds before timing out. -[default5]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe618d6b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7fe61a044c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7fe61a049a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7fe61a04adcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #4: + 0xd3e95 (0x7fe665ae3e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #5: + 0x8609 (0x7fe66ab2a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #6: clone + 0x43 (0x7fe66a8f5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default5]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default5]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fe618d6b897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default5]:frame #1: + 0xe32119 (0x7fe619cce119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default5]:frame #2: + 0xd3e95 (0x7fe665ae3e95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default5]:frame #3: + 0x8609 (0x7fe66ab2a609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default5]:frame #4: clone + 0x43 (0x7fe66a8f5353 in /lib/x86_64-linux-gnu/libc.so.6) -[default5]: -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 3] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:577] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:583] [Rank 3] To avoid data inconsistency, we are taking the entire process down. -[default3]:[rank3]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. -[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7461fc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f746329fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f74632a4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f74632a5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f74aed3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f74b3d85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f74b3b50353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:terminate called after throwing an instance of 'c10::DistBackendError' -[default3]: what(): [PG 2 Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600047 milliseconds before timing out. 
-[default3]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7461fc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f746329fc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f74632a4a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f74632a5dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #4: + 0xd3e95 (0x7f74aed3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #5: + 0x8609 (0x7f74b3d85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #6: clone + 0x43 (0x7f74b3b50353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default3]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default3]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f7461fc6897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default3]:frame #1: + 0xe32119 (0x7f7462f29119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default3]:frame #2: + 0xd3e95 (0x7f74aed3ee95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default3]:frame #3: + 0x8609 (0x7f74b3d85609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default3]:frame #4: clone + 0x43 (0x7f74b3b50353 in /lib/x86_64-linux-gnu/libc.so.6) -[default3]: -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 6] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:577] [Rank 6] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:583] [Rank 6] To avoid data inconsistency, we are taking the entire process down. -[default6]:[rank6]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. 
-[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47ce265897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47cf53ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47cf543a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47cf544dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f481afdde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4820024609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f481fdef353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:terminate called after throwing an instance of 'c10::DistBackendError' -[default6]: what(): [PG 2 Rank 6] Process group watchdog thread terminated with exception: [Rank 6] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600074 milliseconds before timing out. -[default6]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47ce265897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f47cf53ec62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f47cf543a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f47cf544dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #4: + 0xd3e95 (0x7f481afdde95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #5: + 0x8609 (0x7f4820024609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #6: clone + 0x43 (0x7f481fdef353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default6]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default6]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f47ce265897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default6]:frame #1: + 0xe32119 (0x7f47cf1c8119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default6]:frame #2: + 0xd3e95 (0x7f481afdde95 in 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default6]:frame #3: + 0x8609 (0x7f4820024609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default6]:frame #4: clone + 0x43 (0x7f481fdef353 in /lib/x86_64-linux-gnu/libc.so.6) -[default6]: -[default1]:terminate called after throwing an instance of 'c10::DistBackendError' -[default1]: what(): [PG 2 Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600079 milliseconds before timing out. -[default1]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f01430d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f01443b2c62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f01443b7a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f01443b8dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #4: + 0xd3e95 (0x7f018fe51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #5: + 0x8609 (0x7f0194e98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #6: clone + 0x43 (0x7f0194c63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default1]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default1]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f01430d9897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default1]:frame #1: + 0xe32119 (0x7f014403c119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default1]:frame #2: + 0xd3e95 (0x7f018fe51e95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default1]:frame #3: + 0x8609 (0x7f0194e98609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default1]:frame #4: clone + 0x43 (0x7f0194c63353 in /lib/x86_64-linux-gnu/libc.so.6) -[default1]: -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 2 Rank 0] Timeout at NCCL work: 152896, last enqueued NCCL work: 153014, last completed NCCL work: 152895. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. -[default0]:[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down. 
-[default0]:[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 2 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. -[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f54770c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f547839cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f54783a1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f54783a2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f54c3e3be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f54c8e82609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f54c8c4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:terminate called after throwing an instance of 'c10::DistBackendError' -[default0]: what(): [PG 2 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=152896, OpType=_REDUCE_SCATTER_BASE, NumelIn=16777216, NumelOut=1048576, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. 
-[default0]:Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:565 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f54770c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional > >) + 0x1d2 (0x7f547839cc62 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x1a0 (0x7f54783a1a80 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x10c (0x7f54783a2dcc in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #4: + 0xd3e95 (0x7f54c3e3be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #5: + 0x8609 (0x7f54c8e82609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #6: clone + 0x43 (0x7f54c8c4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -[default0]:Exception raised from ncclCommWatchdog at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1418 (most recent call first): -[default0]:frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f54770c3897 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libc10.so) -[default0]:frame #1: + 0xe32119 (0x7f5478026119 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so) -[default0]:frame #2: + 0xd3e95 (0x7f54c3e3be95 in /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/../lib/libstdc++.so.6) -[default0]:frame #3: + 0x8609 (0x7f54c8e82609 in /lib/x86_64-linux-gnu/libpthread.so.0) -[default0]:frame #4: clone + 0x43 (0x7f54c8c4d353 in /lib/x86_64-linux-gnu/libc.so.6) -[default0]: -E0703 04:38:39.840000 139953438979904 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: -6) local_rank: 0 (pid: 25415) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED 
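All eight ranks on this node abort for the same reason: a _REDUCE_SCATTER_BASE collective sat in the NCCL watchdog for the full Timeout(ms)=600000 window, after which the watchdog tears the process down, which is why every worker in the summary below exits with -6 (SIGABRT). If the collective is slow rather than truly deadlocked, the watchdog window can be widened when the process group is created. A minimal sketch, assuming the usual torchrun environment variables are present (nanotron builds its process groups internally, so this is illustrative only, not the repository's code):

    # Illustrative sketch only: widen the NCCL watchdog timeout behind the
    # Timeout(ms)=600000 aborts above. Assumes MASTER_ADDR, MASTER_PORT, RANK and
    # WORLD_SIZE are in the environment (torchrun exports them).
    from datetime import timedelta

    import torch.distributed as dist

    dist.init_process_group(
        backend="nccl",
        timeout=timedelta(minutes=30),  # the run above hit the 10-minute window
    )

Exporting NCCL_DEBUG=INFO for the job is the usual way to see which communicator and which collective are stuck before the watchdog fires.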
------------------------------------------------------------
-Failures:
-[1]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 1 (local_rank: 1)
-  exitcode : -6 (pid: 25416)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25416
-[2]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 2 (local_rank: 2)
-  exitcode : -6 (pid: 25417)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25417
-[3]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 3 (local_rank: 3)
-  exitcode : -6 (pid: 25418)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25418
-[4]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 4 (local_rank: 4)
-  exitcode : -6 (pid: 25419)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25419
-[5]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 5 (local_rank: 5)
-  exitcode : -6 (pid: 25420)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25420
-[6]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 6 (local_rank: 6)
-  exitcode : -6 (pid: 25421)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25421
-[7]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 7 (local_rank: 7)
-  exitcode : -6 (pid: 25422)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25422
------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time : 2024-07-03_04:38:39
-  host : ip-26-0-160-225.ec2.internal
-  rank : 0 (local_rank: 0)
-  exitcode : -6 (pid: 25415)
-  error_file:
-  traceback : Signal 6 (SIGABRT) received by PID 25415
-============================================================
-srun: error: ip-26-0-160-225: task 0: Exited with exit code 1
-W0703 04:38:43.935000 139755525015296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1451450_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:38:43.988000 140126129862400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1176082_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:38:43.990000 140275510273792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_913723_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:38:43.996000 139730121897728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3925350_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:38:44.057000 139971917739776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_695729_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:38:44.164000 140503224817408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3796129_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
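The error_file column in the summary above is left blank because the workers were killed by a signal raised from the NCCL watchdog thread, not by a Python exception; torchrun can only fill in error_file when the entrypoint records its own exception. For failures that do surface in Python, wrapping the entrypoint with the elastic `record` decorator makes the traceback land in that column. A minimal sketch (not the nanotron entrypoint):

    # Sketch: @record writes any uncaught Python exception from main() to the
    # error file that torchrun then shows in its failure summary.
    from torch.distributed.elastic.multiprocessing.errors import record

    @record
    def main() -> None:
        ...  # training entrypoint

    if __name__ == "__main__":
        main()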
-W0703 04:38:44.687000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913793 closing signal SIGTERM -W0703 04:38:44.688000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913794 closing signal SIGTERM -W0703 04:38:44.688000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913795 closing signal SIGTERM -W0703 04:38:44.688000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913796 closing signal SIGTERM -W0703 04:38:44.691000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913797 closing signal SIGTERM -W0703 04:38:44.693000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913798 closing signal SIGTERM -W0703 04:38:44.693000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913799 closing signal SIGTERM -W0703 04:38:44.693000 140281171007296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 913800 closing signal SIGTERM -W0703 04:38:44.734000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451519 closing signal SIGTERM -W0703 04:38:44.735000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451520 closing signal SIGTERM -W0703 04:38:44.735000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451521 closing signal SIGTERM -W0703 04:38:44.737000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451522 closing signal SIGTERM -W0703 04:38:44.737000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451523 closing signal SIGTERM -W0703 04:38:44.737000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451524 closing signal SIGTERM -W0703 04:38:44.739000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451525 closing signal SIGTERM -W0703 04:38:44.739000 139761185748800 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1451526 closing signal SIGTERM -W0703 04:38:44.746000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695798 closing signal SIGTERM -W0703 04:38:44.746000 140508885550912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796198 closing signal SIGTERM -W0703 04:38:44.746000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695799 closing signal SIGTERM -W0703 04:38:44.746000 140508885550912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796199 closing signal SIGTERM -W0703 04:38:44.746000 140508885550912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796200 closing signal SIGTERM -W0703 04:38:44.746000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695800 closing signal SIGTERM -W0703 04:38:44.747000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695801 closing signal SIGTERM -W0703 04:38:44.748000 140508885550912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796201 closing signal SIGTERM -W0703 04:38:44.747000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695802 closing signal SIGTERM -W0703 04:38:44.748000 140508885550912 
torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796202 closing signal SIGTERM -W0703 04:38:44.748000 140508885550912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796203 closing signal SIGTERM -W0703 04:38:44.748000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695803 closing signal SIGTERM -W0703 04:38:44.748000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695804 closing signal SIGTERM -W0703 04:38:44.749000 139977578473280 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 695805 closing signal SIGTERM -W0703 04:38:44.750000 140508885550912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796204 closing signal SIGTERM -W0703 04:38:44.750000 140508885550912 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3796205 closing signal SIGTERM -W0703 04:38:44.798000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925419 closing signal SIGTERM -W0703 04:38:44.798000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925420 closing signal SIGTERM -W0703 04:38:44.798000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925421 closing signal SIGTERM -W0703 04:38:44.798000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925422 closing signal SIGTERM -W0703 04:38:44.801000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925423 closing signal SIGTERM -W0703 04:38:44.801000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925424 closing signal SIGTERM -W0703 04:38:44.801000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925425 closing signal SIGTERM -W0703 04:38:44.803000 139735782631232 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3925426 closing signal SIGTERM -W0703 04:38:44.808000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176151 closing signal SIGTERM -W0703 04:38:44.808000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176152 closing signal SIGTERM -W0703 04:38:44.808000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176153 closing signal SIGTERM -W0703 04:38:44.809000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176154 closing signal SIGTERM -W0703 04:38:44.809000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176155 closing signal SIGTERM -W0703 04:38:44.810000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176156 closing signal SIGTERM -W0703 04:38:44.810000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176157 closing signal SIGTERM -W0703 04:38:44.810000 140131790595904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1176158 closing signal SIGTERM -W0703 04:38:48.943000 139755525015296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1451450_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 04:38:48.992000 140126129862400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1176082_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:48.994000 140275510273792 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-88.ec2.internal_913723_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:49.001000 139730121897728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3925350_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:49.062000 139971917739776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_695729_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:49.169000 140503224817408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3796129_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:52.352000 140281171007296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_913723_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:52.369000 140281171007296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-88.ec2.internal_913723_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. 
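The Broken pipe / RendezvousConnectionError cascade on the remaining seven nodes is a follow-on failure rather than a separate bug: with the c10d rendezvous backend, the TCP store is hosted by the agent running on MASTER_ADDR (the first node in the launcher), so once task 0 on ip-26-0-160-225 exits, every other agent loses its connection to the store and can neither send keep-alives nor shut the rendezvous down. A hypothetical probe (not part of bench_cluster) that checks whether the store endpoint is still reachable, assuming the job's randomized MASTER_PORT is known:

    # Hypothetical helper: returns True if the c10d TCPStore behind the
    # rendezvous endpoint still accepts client connections.
    from datetime import timedelta
    from torch.distributed import TCPStore

    def store_is_up(host: str, port: int) -> bool:
        try:
            TCPStore(host, port, is_master=False, timeout=timedelta(seconds=5))
            return True
        except Exception:
            return False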
-Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -W0703 04:38:53.947000 139755525015296 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-153.ec2.internal_1451450_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. 
-W0703 04:38:53.997000 140126129862400 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-78.ec2.internal_1176082_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:54.005000 139730121897728 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-62.ec2.internal_3925350_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:54.066000 139971917739776 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-161-138.ec2.internal_695729_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:54.173000 140503224817408 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3796129_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:56.222000 139761185748800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1451450_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:56.238000 139761185748800 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-153.ec2.internal_1451450_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - 
num_nodes_waiting = rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 04:38:56.282000 139735782631232 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3925350_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:56.299000 139735782631232 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-62.ec2.internal_3925350_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = 
rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 04:38:56.316000 139977578473280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_695729_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:56.332000 139977578473280 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-138.ec2.internal_695729_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = 
rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 04:38:56.588000 140508885550912 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3796129_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:56.604000 140508885550912 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3796129_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = 
rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -W0703 04:38:56.624000 140131790595904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1176082_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -W0703 04:38:56.639000 140131790595904 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-161-78.ec2.internal_1176082_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) -torch.distributed.DistNetworkError: Broken pipe - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent - result = agent.run() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper - result = f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run - result = self._invoke_run(role) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run - num_nodes_waiting = 
rdzv_handler.num_nodes_waiting() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting - self._state_holder.sync() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync - get_response = self._backend.get_state() - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state - base64_state: bytes = self._call_store("get", self._key) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store - raise RendezvousConnectionError( -torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: ip-26-0-161-153: task 4: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-161-138: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-225_25415.1719980578573025480.pt.trace.json: 0%| | 0.00/17.7G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/config.yaml deleted file mode 100644 index 75580769785d0a72eb606f902b26dd5a37cfb3be..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 1 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 256 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/log.out deleted file mode 100644 index 25d31568167b36acbb4e05654250d9e07cbcbb00..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/log.out +++ /dev/null @@ -1,4171 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:30:36 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:30:42.361000 140146482988864 torch/distributed/run.py:757] -W0703 03:30:42.361000 140146482988864 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.361000 140146482988864 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:30:42.361000 140146482988864 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.392000 139727339403072 torch/distributed/run.py:757] -W0703 03:30:42.392000 139727339403072 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.392000 139727339403072 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:30:42.392000 139727339403072 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.427000 140044900476736 torch/distributed/run.py:757] -W0703 03:30:42.427000 140044900476736 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.427000 140044900476736 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:30:42.427000 140044900476736 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.427000 139748038281024 torch/distributed/run.py:757] -W0703 03:30:42.427000 139748038281024 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.427000 139748038281024 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:30:42.427000 139748038281024 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.603000 140382654310208 torch/distributed/run.py:757] -W0703 03:30:42.603000 140382654310208 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.603000 140382654310208 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:30:42.603000 140382654310208 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.653000 140253283452736 torch/distributed/run.py:757] -W0703 03:30:42.653000 140253283452736 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.653000 140253283452736 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:30:42.653000 140253283452736 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.658000 140002523572032 torch/distributed/run.py:757] -W0703 03:30:42.658000 140002523572032 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.658000 140002523572032 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:30:42.658000 140002523572032 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.694000 139885389195072 torch/distributed/run.py:757] -W0703 03:30:42.694000 139885389195072 torch/distributed/run.py:757] ***************************************** -W0703 03:30:42.694000 139885389195072 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:30:42.694000 139885389195072 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:31:07 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config: -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: run='%date_%jobid', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: step=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: consumed_train_samples=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: benchmark_csv_path=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: parallelism=ParallelismArgs(dp=4, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp=16, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pp_engine=, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_mode=, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: expert_parallel_size=1), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:31:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: eos_token_id=2, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_revision=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokenizer_max_length=None), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoint_interval=100000, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: save_initial_state=False, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: log_level_replica='info', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: train_steps=20, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: micro_batch_size=256, -[default0]:07/03/2024 03:31:07 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: batch_accumulation_per_replica=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: val_check_interval=-1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_val_batches=0, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: limit_test_batches=0), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta1=0.9, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: adam_beta2=0.95, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: name='adamW'), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: zero_stage=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: weight_decay=0.01, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: clip_grad=1.0, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_steps=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_warmup_style='linear', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_style='linear', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_steps=19, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: start_training_step=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_splits='train', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: text_column_name='text'), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: seed=42, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_loading_workers=0))], -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256')), -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: lighteval=None) -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Model Config: -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: 
eos_token_id=2, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_act='silu', -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: hidden_size=2048, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: initializer_range=0.02, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: intermediate_size=4096, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: is_llama_config=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: max_position_embeddings=4096, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_attention_heads=32, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_hidden_layers=24, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: num_key_value_heads=32, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pad_token_id=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: pretraining_tp=1, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_scaling=None, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: rope_theta=10000.0, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: tie_word_embeddings=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: use_cache=True, -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: vocab_size=50272) -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Building model.. -[default0]:07/03/2024 03:31:07 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Setting PP block ranks... -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-138]: No checkpoint path provided. -[default5]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-138]: No checkpoint path provided. -[default2]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-138]: No checkpoint path provided. -[default4]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-138]: No checkpoint path provided. 
-[default1]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-138]: No checkpoint path provided. -[default6]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-138]: No checkpoint path provided. -[default7]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-138]: No checkpoint path provided. -[default3]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Total number of parameters: 1.11G (2119.44MiB) -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Parametrizing model parameters using StandardParametrizator -[default6]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-103]: No checkpoint path provided. 
-[default7]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 03:31:24 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-103]: No checkpoint path provided. -[default1]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default2]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default4]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=12|ip-26-0-171-88]: No checkpoint path provided. -[default5]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default6]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 03:31:24 [INFO|DP=3|PP=0|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default0]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-78]: No checkpoint path provided. 
-[default6]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default5]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=15|ip-26-0-161-78]: No checkpoint path provided. -[default6]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-153]: No checkpoint path provided. -[default1]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-153]: No checkpoint path provided. -[default2]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-153]: No checkpoint path provided. -[default3]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 03:31:24 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=0|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=1|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=3|ip-26-0-166-125]: No checkpoint path provided. -[default6]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=6|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default3]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=5|ip-26-0-166-125]: No checkpoint path provided. -[default2]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=2|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=4|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default7]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=7|ip-26-0-166-125]: No checkpoint path provided. -[default6]:07/03/2024 03:31:25 [INFO|DP=2|PP=0|TP=14|ip-26-0-171-102]: No checkpoint path provided. 
-[default0]:07/03/2024 03:31:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 03:31:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 03:31:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 03:31:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 03:31:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 03:31:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 03:31:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 03:31:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Using `datasets` library -[default0]:07/03/2024 03:31:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 03:31:28 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:31:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 03:31:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 03:31:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: -[default0]:07/03/2024 03:31:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: [Start training] datetime: 2024-07-03 03:31:29.078953 | mbs: 256 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 03:31:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 03:31:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-103]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB -[default3]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=11|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=9|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. 
-[default6]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=15|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=1|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=2|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=4|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=13|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=14|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=12|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=0|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=3|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=6|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=5|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=7|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=2|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=3|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=5|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=4|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=2|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=4|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=5|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=7|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=10|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=8|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. 
Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 03:31:29 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-138]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=6|ip-26-0-166-125]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=6|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 03:31:29 [WARNING|DP=3|PP=0|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 03:31:29 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 03:31:29 [WARNING|DP=2|PP=0|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:[rank7]: Traceback (most recent call last):
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default7]:[rank7]:     trainer.train(dataloader)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default7]:[rank7]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default7]:[rank7]:     outputs = self.pipeline_engine.train_batch_iter(
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default7]:[rank7]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default7]:[rank7]:     output = model(**micro_batch)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank7]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank7]:     return forward_call(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default7]:[rank7]:     sharded_logits = self.model(
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank7]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank7]:     return forward_call(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default7]:[rank7]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default7]:[rank7]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank7]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank7]:     return forward_call(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default7]:[rank7]:     output = self.pp_block(**new_kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank7]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank7]:     return forward_call(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default7]:[rank7]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank7]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank7]:     return forward_call(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default7]:[rank7]:     output = self.o_proj(attention_output)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default7]:[rank7]:     return self._call_impl(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default7]:[rank7]:     return forward_call(*args, **kwargs)
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default7]:[rank7]:     return row_linear(
-[default7]:[rank7]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default7]:[rank7]:     out = F.linear(input, weight, bias)
-[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 1019.94 MiB is free. Including non-PyTorch memory, this process has 78.32 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 1019.94 MiB is free. Including non-PyTorch memory, this process has 78.32 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 1019.94 MiB is free. Including non-PyTorch memory, this process has 78.32 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU
-[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 507.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 419.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 419.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 507.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 419.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 507.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank0]: Traceback (most recent call last):
-[default1]:[rank1]: Traceback (most recent call last):
-[default4]:[rank4]: Traceback (most recent call last):
-[default6]:[rank6]: Traceback (most recent call last):
-[default1]:[rank1]: File
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: sharded_logits = self.model( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank4]: output = self.pp_block(**new_kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank1]: output = self.pp_block(**new_kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank0]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank4]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank1]: output = self.o_proj(attention_output) -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank4]: output = self.o_proj(attention_output) -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank6]: output = self.pp_block(**new_kwargs) -[default4]:[rank4]: return forward_call(*args, **kwargs) 
-[default0]:[rank0]: return row_linear( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank1]: return forward_call(*args, **kwargs) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank0]: out = F.linear(input, weight, bias) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank4]: return row_linear( -[default1]:[rank1]: return row_linear( -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank4]: out = F.linear(input, weight, bias) -[default6]:[rank6]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.08 GiB is free. Including non-PyTorch memory, this process has 78.24 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank1]: out = F.linear(input, weight, bias) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1019.94 MiB is free. Including non-PyTorch memory, this process has 78.32 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank6]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank6]: return row_linear( -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: out = F.linear(input, weight, bias) -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.08 GiB is free. Including non-PyTorch memory, this process has 78.24 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank11]: Traceback (most recent call last): -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank11]: trainer.train(dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank11]: output = model(**micro_batch) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank11]: sharded_logits = self.model( -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank11]: output = self.pp_block(**new_kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank11]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank2]: Traceback (most recent call last): -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank2]: trainer.train(dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/na[default3]:[rank11]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank11]: output = self.o_proj(attention_output) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank11]: return self._call_impl(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank11]: return forward_call(*args, **kwargs) -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank11]: return row_linear( -[default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank11]: out = F.linear(input, weight, bias) -[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 507.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Menotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank2]: output = model(**micro_batch) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -mory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank2]: sharded_logits = self.model( -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank2]: output = self.pp_block(**new_kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank2]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: 
return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank2]: output = self.o_proj(attention_output) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank2]: return self._call_impl(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank2]: return forward_call(*args, **kwargs) -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank2]: return row_linear( -[default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank2]: out = F.linear(input, weight, bias) -[default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.08 GiB is free. Including non-PyTorch memory, this process has 78.24 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank58]: Traceback (most recent call last): -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank58]: trainer.train(dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank58]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank58]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank58]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank58]: output = model(**micro_batch) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank58]: sharded_logits = self.model( -[default2]:[rank58]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank58]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank58]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank58]: output = self.pp_block(**new_kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank58]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank58]: output = self.o_proj(attention_output) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank58]: return self._call_impl(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank58]: return forward_call(*args, **kwargs) -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank58]: return 
row_linear( -[default2]:[rank58]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank58]: out = F.linear(input, weight, bias) -[default2]:[rank58]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1019.94 MiB is free. Including non-PyTorch memory, this process has 78.32 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank59]: output = self.pp_block(**new_kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank59]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank59]: output = self.o_proj(attention_output) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank59]: return row_linear( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank59]: out = F.linear(input, weight, bias) -[default3]:[rank59]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.08 GiB is free. Including non-PyTorch memory, this process has 78.24 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default0]:[rank56]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: sharded_logits = self.model( -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: output = self.pp_block(**new_kwargs) -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: 
hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[... the remainder of this crash dump interleaves near-identical tracebacks from ranks 48-57 and 60-63; one de-interleaved traceback (rank 49) is kept below as representative and the duplicated, interleaved copies are omitted ...]
-[default1]:[rank49]: Traceback (most recent call last):
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default1]:[rank49]:     trainer.train(dataloader)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default1]:[rank49]:     outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default1]:[rank49]:     outputs = self.pipeline_engine.train_batch_iter(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default1]:[rank49]:     output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default1]:[rank49]:     output = model(**micro_batch)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default1]:[rank49]:     sharded_logits = self.model(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default1]:[rank49]:     return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default1]:[rank49]:     hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default1]:[rank49]:     output = self.pp_block(**new_kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default1]:[rank49]:     output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default1]:[rank49]:     output = self.o_proj(attention_output)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default1]:[rank49]:     return self._call_impl(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default1]:[rank49]:     return forward_call(*args, **kwargs)
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default1]:[rank49]:     return row_linear(
-[default1]:[rank49]:   File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default1]:[rank49]:     out = F.linear(input, weight, bias)
-[default1]:[rank49]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 419.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[... every other rank in this group failed at the same F.linear call inside row_linear (o_proj of the attention block), each trying to allocate 4.00 GiB on a 79.33 GiB GPU with between 419.94 MiB and 1.08 GiB free and 68.84 GiB already allocated by PyTorch ...]
-[default5]:[rank53]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 419.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank48]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank48]: output = self.pp_block(**new_kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank54]: output = self.pp_block(**new_kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default0]:[rank48]: 
File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank48]: output = self.o_proj(attention_output) -[default6]:[rank54]: output = self.o_proj(attention_output) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank54]: return row_linear( -[default0]:[rank48]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank48]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank48]: return row_linear( -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank48]: out = F.linear(input, weight, bias) -[default0]:[rank48]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default6]:[rank54]: out = F.linear(input, weight, bias) -[default6]:[rank54]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 507.94 MiB is free. 
Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -W0703 03:31:48.866000 139727339403072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 890159 closing signal SIGTERM -W0703 03:31:48.867000 139727339403072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 890161 closing signal SIGTERM -W0703 03:31:48.867000 139727339403072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 890162 closing signal SIGTERM -W0703 03:31:48.867000 139727339403072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 890163 closing signal SIGTERM -W0703 03:31:48.867000 139727339403072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 890164 closing signal SIGTERM -E0703 03:31:50.187000 139727339403072 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 890158) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:31:48 - host : ip-26-0-161-103.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 890160) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:31:48 - host : ip-26-0-161-103.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 890165) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:31:48 - host : ip-26-0-161-103.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 890158) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-161-103: task 1: 
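The allocator hint in that message is generic advice printed by PyTorch: expandable_segments only helps when a large amount of memory is reserved but unallocated, and here only 466.36 MiB is, so fragmentation is unlikely to be the real problem; this configuration simply does not fit. For completeness, a minimal, hypothetical sketch of how the option could be applied (not taken from the original benchmark scripts), assuming it is set before the first CUDA allocation:

    import os

    # Assumption: this runs before anything has touched the GPU, so the CUDA
    # caching allocator has not been initialized yet and will pick up the option.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch  # imported after setting the variable, to keep the ordering obvious

    if torch.cuda.is_available():
        x = torch.empty(1024, device="cuda")  # first CUDA allocation
        print(torch.cuda.memory_reserved())   # reserved bytes, as a quick sanity check

Equivalently, the variable can be exported in the launching shell before torchrun starts the workers.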
Once the workers have died, the torchrun elastic agent on each affected node tears its part of the job down: it sends SIGTERM to the surviving local workers, logs "failed (exitcode: 1) local_rank: 0" for the first failed worker, raises torch.distributed.elastic.multiprocessing.errors.ChildFailedError from launch_agent (the agents' own tracebacks through torchrun, torch/distributed/run.py and torch/distributed/launcher/api.py are identical on every node and are not repeated here), prints the "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED" summary, and the corresponding srun task exits with code 1. No error_file was written for any rank, so every failure entry only carries "traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html". Per node:
  * ip-26-0-161-103.ec2.internal (srun task 1, 2024-07-03_03:31:48): root cause rank 0 (local_rank 0, pid 890158); further failures on ranks 2 and 7 (pids 890160 and 890165); SIGTERM sent to pids 890159 and 890161-890164.
  * ip-26-0-161-138.ec2.internal (srun task 2, 2024-07-03_03:31:53): root cause rank 8 (local_rank 0, pid 686104); further failures on ranks 9, 10, 12, 13, 14 and 15 (pids 686105-686106 and 686108-686111); SIGTERM sent to pid 686107.
  * ip-26-0-171-88.ec2.internal (srun task 6, 2024-07-03_03:32:03): root cause rank 56 (local_rank 0, pid 903453); further failures on ranks 57-63 (pids 903454-903460).
  * ip-26-0-171-62.ec2.internal (srun task 5, 2024-07-03_03:32:03): root cause rank 48 (local_rank 0, pid 3915091); further failures on ranks 49 and 52 (pids 3915092 and 3915095); SIGTERM sent to pids 3915093-3915094 and 3915096-3915098.
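Every failure entry above has an empty error_file, which is why torchrun can only point at https://pytorch.org/docs/stable/elastic/errors.html instead of showing the worker's traceback. PyTorch's documented way to fill those fields is the record decorator from torch.distributed.elastic.multiprocessing.errors; a minimal sketch, assuming one wanted richer reports from these runs (illustrative only, not something the original run_train.py is known to do):

    from torch.distributed.elastic.multiprocessing.errors import record

    @record
    # With the decorator applied, an uncaught exception in the entry point is
    # written to the per-rank error file that torchrun sets up, and then appears
    # in the FAILED summary instead of an empty error_file/traceback field.
    def main() -> None:
        # Hypothetical stand-in for the real entry point, which builds a trainer
        # and calls trainer.train(dataloader).
        print("training would start here")

    if __name__ == "__main__":
        main()

With that in place, the elastic agent's failure report should carry the worker's actual exception rather than only the documentation link.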
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: Traceback (most recent call last): -[default1]:[rank33]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: trainer.train(dataloader) -[default2]:[rank34]: trainer.train(dataloader) -[default1]:[rank33]: trainer.train(dataloader) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank32]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: output 
= model(**micro_batch) -[default3]:[rank35]: output = model(**micro_batch) -[default1]:[rank33]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank34]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default1]:[rank33]: output = model(**micro_batch) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: sharded_logits = self.model( -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: sharded_logits = self.model( -[default2]:[rank34]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: sharded_logits = self.model( -[default2]:[rank34]: sharded_logits = self.model( -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank32]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank34]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank33]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: return forward_call(*args, **kwargs) 
-[default0]:[rank32]: output = self.pp_block(**new_kwargs) -[default2]:[rank34]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: Traceback (most recent call last): -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: trainer.train(dataloader) -[default2]:[rank42]: Traceback (most recent call last): -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default2]:[rank42]: trainer.train(dataloader) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank42]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default2]:[rank42]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank47]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank33]: output = self.pp_block(**new_kwargs) -[default2]:[rank34]: output = self.pp_block(**new_kwargs) -[default4]:[rank44]: Traceback (most recent call last): -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank32]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: trainer.train(dataloader) -[default7]:[rank47]: output = model(**micro_batch) -[default2]:[rank42]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default4]:[rank44]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank42]: output = model(**micro_batch) -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank34]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default1]:[rank33]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank44]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank32]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank34]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: sharded_logits = self.model( -[default1]:[rank33]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: output = model(**micro_batch) -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank42]: sharded_logits = self.model( -[default1]:[rank33]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: return forward_call(*args, **kwargs) -[default0]:[rank32]: output = self.o_proj(attention_output) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank33]: output = self.o_proj(attention_output) -[default2]:[rank34]: return forward_call(*args, **kwargs) -[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: return self._call_impl(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank44]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank35]: output = self.pp_block(**new_kwargs) -[default4]:[rank44]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank32]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default2]:[rank34]: output = self.o_proj(attention_output) -[default4]:[rank44]: return forward_call(*args, **kwargs) -[default2]:[rank34]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank42]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank32]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank44]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank33]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank32]: return forward_call(*args, **kwargs) -[default2]:[rank42]: return forward_call(*args, **kwargs) -[default3]:[rank35]: return self._call_impl(*args, **kwargs) -[default4]:[rank44]: 
[Ranks 32-47 all abort with the same CUDA out-of-memory traceback, raised from the attention output projection (o_proj row-linear); only the per-rank memory figures differ (free memory 259.94 MiB to 467.94 MiB, process memory in use 78.86 GiB to 79.06 GiB). A representative traceback, with the repeated torch.nn Module _wrapped_call_impl/_call_impl wrapper frames elided:]

Traceback (most recent call last):
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
    trainer.train(dataloader)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
    outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
    outputs = self.pipeline_engine.train_batch_iter(
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
    output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
    output = model(**micro_batch)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
    sharded_logits = self.model(
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
    return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
    hidden_encoder_states = encoder_block(**hidden_encoder_states)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
    output = self.pp_block(**new_kwargs)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
    output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
    output = self.o_proj(attention_output)
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
    return row_linear(
  File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
    out = F.linear(input, weight, bias)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU has a total capacity of 79.33 GiB of which 259.94 MiB to 467.94 MiB is free (varies per rank). Including non-PyTorch memory, each process has 78.86 GiB to 79.06 GiB in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank38]: out = F.linear(input, weight, bias) -[default6]:[rank38]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 427.94 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank37]: return self._call_impl(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank37]: return forward_call(*args, **kwargs) -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank37]: return row_linear( -[default5]:[rank37]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank37]: out = F.linear(input, weight, bias) -[default5]:[rank37]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 259.94 MiB is free. Including non-PyTorch memory, this process has 79.06 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank25]: Traceback (most recent call last): -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank25]: trainer.train(dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank25]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank25]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank25]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank25]: output = model(**micro_batch) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank25]: sharded_logits = self.model( -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank25]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank25]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank25]: output = self.pp_block(**new_kwargs) -[default1]:[rank25]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank25]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank25]: output = self.o_proj(attention_output) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank25]: return self._call_impl(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank25]: return forward_call(*args, **kwargs) -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank25]: return row_linear( -[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank25]: out = F.linear(input, weight, bias) -[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 427.94 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = model(**micro_batch) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default6]:[rank30]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank30]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank30]: output = self.o_proj(attention_output) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank30]: return row_linear( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank30]: out = F.linear(input, weight, bias) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 259.94 MiB is free. Including non-PyTorch memory, this process has 79.06 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank20]: Traceback (most recent call last): -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank20]: trainer.train(dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank20]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank20]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank20]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank20]: output = model(**micro_batch) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank20]: sharded_logits = self.model( -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank20]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank20]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank20]: output = self.pp_block(**new_kwargs) -[default4]:[rank20]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank20]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank20]: output = self.o_proj(attention_output) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank20]: return self._call_impl(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank20]: return forward_call(*args, **kwargs) -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank20]: return row_linear( -[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank20]: out = F.linear(input, weight, bias) -[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 467.94 MiB is free. Including non-PyTorch memory, this process has 78.86 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank18]: Traceback (most recent call last): -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank18]: trainer.train(dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank18]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank18]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank18]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank18]: output = model(**micro_batch) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank18]: sharded_logits = self.model( -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank18]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank18]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank18]: output = self.pp_block(**new_kwargs) -[default2]:[rank18]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank18]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank18]: output = self.o_proj(attention_output) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank18]: return self._call_impl(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank18]: return forward_call(*args, **kwargs) -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank18]: return row_linear( -[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank18]: out = F.linear(input, weight, bias) -[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 467.94 MiB is free. Including non-PyTorch memory, this process has 78.86 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank17]: Traceback (most recent call last): -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank17]: trainer.train(dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank17]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank17]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank17]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank17]: output = model(**micro_batch) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank17]: sharded_logits = self.model( -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank17]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank17]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank17]: output = self.pp_block(**new_kwargs) -[default1]:[rank17]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank17]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank17]: output = self.o_proj(attention_output) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank17]: return self._call_impl(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank17]: return forward_call(*args, **kwargs) -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank17]: return row_linear( -[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank17]: out = F.linear(input, weight, bias) -[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 379.94 MiB is free. Including non-PyTorch memory, this process has 78.95 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank23]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank23]: output = self.o_proj(attention_output) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank23]: return row_linear( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank23]: out = F.linear(input, weight, bias) -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 379.94 MiB is free. Including non-PyTorch memory, this process has 78.95 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default3]:[rank19]: trainer.train(dataloader) -[default6]:[rank22]: Traceback (most recent call last): -[default5]:[rank21]: Traceback (most recent call last): -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank21]: trainer.train(dataloader) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank22]: trainer.train(dataloader) -[default5]:[rank21]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank22]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank21]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank21]: output = model(**micro_batch) -[default6]:[rank22]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter 
-[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: output = model(**micro_batch) -[default6]:[rank22]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: output = model(**micro_batch) -[default3]:[rank19]: output = model(**micro_batch) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank21]: sharded_logits = self.model( -[default3]:[rank19]: sharded_logits = self.model( 
-[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: sharded_logits = self.model( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank21]: 
File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in 
_wrapped_call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default5]:[rank21]: output = self.pp_block(**new_kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default6]:[rank22]: output = self.pp_block(**new_kwargs) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: output = self.o_proj(attention_output) -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default5]:[rank21]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank22]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank19]: return row_linear( -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank19]: out = F.linear(input, weight, bias) -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 379.94 MiB is free. Including non-PyTorch memory, this process has 78.95 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default6]:[rank22]: output = self.o_proj(attention_output) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank16]: output = self.o_proj(attention_output) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank21]: output = self.o_proj(attention_output) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank22]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank21]: return self._call_impl(*args, **kwargs) -[default5]:[rank21]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return row_linear( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank21]: return forward_call(*args, **kwargs) -[default6]:[rank22]: return forward_call(*args, **kwargs) -[default0]:[rank16]: out = F.linear(input, weight, bias) -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default6]:[rank22]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: out = F.linear(input, weight, bias) -[default5]:[rank21]: return row_linear( -[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. 
Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 467.94 MiB is free. Including non-PyTorch memory, this process has 78.86 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank21]: out = F.linear(input, weight, bias) -[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 379.94 MiB is free. Including non-PyTorch memory, this process has 78.95 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank24]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank24]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: output = model(**micro_batch) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: trainer.train(dataloader) -[default0]:[rank24]: sharded_logits = self.model( -[default3]:[rank27]: outputs, loss_avg = 
self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: output = model(**micro_batch) -[default4]:[rank28]: output = model(**micro_batch) -[default0]:[rank24]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: Traceback (most recent call last): -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, 
in -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default7]:[rank31]: Traceback (most recent call last): -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default5]:[rank29]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: sharded_logits = self.model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: sharded_logits = self.model( -[default5]:[rank29]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: output = model(**micro_batch) -[default5]:[rank29]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default5]:[rank29]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank29]: output = model(**micro_batch) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default5]:[rank29]: sharded_logits = self.model( -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: output = self.o_proj(attention_output) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank24]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank28]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank28]: output = self.o_proj(attention_output) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: Traceback (most recent call last): -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: trainer.train(dataloader) -[default0]:[rank24]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default0]:[rank24]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: output = self.o_proj(attention_output) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default4]:[rank28]: return row_linear( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank28]: out = F.linear(input, weight, bias) -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl 
-[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank24]: return row_linear( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 259.94 MiB is free. Including non-PyTorch memory, this process has 79.06 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank24]: out = F.linear(input, weight, bias) -[default3]:[rank27]: return row_linear( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: out = F.linear(input, weight, bias) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 427.94 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank26]: output = model(**micro_batch) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank31]: output = self.o_proj(attention_output) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: output = self.pp_block(**new_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default2]:[rank26]: sharded_logits = self.model( -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: return row_linear( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, 
in row_linear -[default2]:[rank26]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: out = F.linear(input, weight, bias) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 427.94 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank29]: output = self.o_proj(attention_output) -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank26]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank29]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank29]: return row_linear( -[default2]:[rank26]: output = self.pp_block(**new_kwargs) -[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank29]: out = F.linear(input, weight, bias) -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 427.94 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. 
Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank26]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default2]:[rank26]: output = self.o_proj(attention_output) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank26]: return self._call_impl(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank26]: return forward_call(*args, **kwargs) -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default2]:[rank26]: return row_linear( -[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default2]:[rank26]: out = F.linear(input, weight, bias) -[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 259.94 MiB is free. Including non-PyTorch memory, this process has 79.06 GiB memory in use. Of the allocated memory 68.84 GiB is allocated by PyTorch, and 466.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -W0703 03:32:28.890000 140382654310208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 24678 closing signal SIGTERM -W0703 03:32:28.890000 140382654310208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 24682 closing signal SIGTERM -W0703 03:32:28.890000 140382654310208 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 24684 closing signal SIGTERM -W0703 03:32:28.891000 140146482988864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1165766 closing signal SIGTERM -W0703 03:32:28.891000 140146482988864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1165767 closing signal SIGTERM -W0703 03:32:28.892000 140146482988864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1165768 closing signal SIGTERM -W0703 03:32:28.892000 140146482988864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1165769 closing signal SIGTERM -W0703 03:32:28.892000 140146482988864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1165770 closing signal SIGTERM -W0703 03:32:28.892000 140146482988864 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1165772 closing signal SIGTERM -W0703 03:32:28.898000 139748038281024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3786044 closing signal SIGTERM -W0703 03:32:28.899000 139748038281024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3786045 closing signal SIGTERM -W0703 03:32:28.899000 139748038281024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3786047 closing signal SIGTERM -W0703 03:32:28.899000 139748038281024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3786048 closing signal SIGTERM -W0703 03:32:28.899000 139748038281024 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3786049 closing signal SIGTERM -W0703 03:32:28.900000 139885389195072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1441519 closing signal SIGTERM -W0703 03:32:28.900000 139885389195072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1441520 closing signal SIGTERM -W0703 03:32:28.901000 139885389195072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1441521 closing signal SIGTERM -W0703 03:32:28.901000 139885389195072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1441523 closing signal SIGTERM -W0703 03:32:28.901000 139885389195072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1441524 closing signal SIGTERM -W0703 03:32:28.901000 139885389195072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1441525 closing signal SIGTERM -E0703 03:32:29.633000 140382654310208 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 24677) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:32:28 - host : ip-26-0-166-125.ec2.internal - rank : 34 (local_rank: 2) - exitcode : 1 (pid: 24679) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:32:28 - host : ip-26-0-166-125.ec2.internal - rank : 35 (local_rank: 3) - exitcode : 1 (pid: 24680) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_03:32:28 - host : ip-26-0-166-125.ec2.internal - rank : 36 (local_rank: 4) - exitcode : 1 (pid: 24681) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_03:32:28 - host : ip-26-0-166-125.ec2.internal - rank : 38 (local_rank: 6) - exitcode : 1 (pid: 24683) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:32:28 - host : ip-26-0-166-125.ec2.internal - rank : 32 (local_rank: 0) - exitcode : 1 (pid: 24677) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-166-125: task 4: Exited with exit code 1 -E0703 03:32:30.237000 139748038281024 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3786043) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", 
line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:32:28 - host : ip-26-0-171-102.ec2.internal - rank : 43 (local_rank: 3) - exitcode : 1 (pid: 3786046) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_03:32:28 - host : ip-26-0-171-102.ec2.internal - rank : 47 (local_rank: 7) - exitcode : 1 (pid: 3786050) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:32:28 - host : ip-26-0-171-102.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 3786043) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 03:32:30.339000 139885389195072 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1441518) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:32:28 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 1441522) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:32:28 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 1441518) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -E0703 03:32:30.535000 140146482988864 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 5 (pid: 1165771) of binary: 
/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10 -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_03:32:28 - host : ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1165773) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_03:32:28 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1165771) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 0: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
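The ChildFailedError traces above are the generic per-node reports that torchrun's elastic agent prints when a worker exits non-zero; the actual root cause has to be recovered from the rank logs. The bench.slurm scripts reproduced later in this diff turn such a log.out into a one-word status file with a few grep checks. The sketch below restates that classification as a standalone shell function; the function name and argument handling are illustrative additions, only the grep patterns and status strings come from the scripts themselves.

classify_run_status() {
    # Mirrors the post-srun grep checks in bench.slurm (patterns kept verbatim).
    local log_file=$1
    if grep -q "OutOfMemoryError" "$log_file"; then
        printf "oom"
    elif grep -q " CUDA error: an illegal memory access" "$log_file"; then
        printf "oom"        # illegal-access errors are folded into "oom" by the scripts
    elif grep -q "Timeout at NCCL" "$log_file"; then
        printf "timeout"
    else
        printf "fail"       # a plain ChildFailedError like the run above lands here,
                            # assuming none of the patterns above appear in the log
    fi
}
# Usage (hypothetical paths): classify_run_status log.out > status.txt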
diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-256/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/bench.slurm b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/bench.slurm deleted file mode 100644 index 00806be103ba0e3112c22921bb4bc2dd7c631b46..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/config.yaml deleted file mode 100644 index 84b607090b49f4d67acfaa7a308d88b1e620a2b4..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 8 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 32 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out deleted file mode 100644 index aa198b0fa07dbea0c7ffd177c2ed97661f75369a..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/log.out +++ /dev/null @@ -1,703 +0,0 @@ -======================== -START TIME: Tue Jul 2 23:56:51 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0702 23:56:57.446000 139649206945600 torch/distributed/run.py:757] -W0702 23:56:57.446000 139649206945600 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.446000 139649206945600 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:56:57.446000 139649206945600 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.467000 139979784869696 torch/distributed/run.py:757] -W0702 23:56:57.467000 139979784869696 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.467000 139979784869696 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:56:57.467000 139979784869696 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.484000 140387301316416 torch/distributed/run.py:757] -W0702 23:56:57.484000 140387301316416 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.484000 140387301316416 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:56:57.484000 140387301316416 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.496000 140130345482048 torch/distributed/run.py:757] -W0702 23:56:57.496000 140130345482048 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.496000 140130345482048 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:56:57.496000 140130345482048 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.662000 140085127972672 torch/distributed/run.py:757] -W0702 23:56:57.662000 140085127972672 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.662000 140085127972672 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0702 23:56:57.662000 140085127972672 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.776000 140201511925568 torch/distributed/run.py:757] -W0702 23:56:57.776000 140201511925568 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.776000 140201511925568 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:56:57.776000 140201511925568 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.776000 140270021408576 torch/distributed/run.py:757] -W0702 23:56:57.776000 140270021408576 torch/distributed/run.py:757] ***************************************** -W0702 23:56:57.776000 140270021408576 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:56:57.776000 140270021408576 torch/distributed/run.py:757] ***************************************** -W0702 23:56:58.226000 140333814708032 torch/distributed/run.py:757] -W0702 23:56:58.226000 140333814708032 torch/distributed/run.py:757] ***************************************** -W0702 23:56:58.226000 140333814708032 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0702 23:56:58.226000 140333814708032 torch/distributed/run.py:757] ***************************************** -[default0]:07/02/2024 23:57:22 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config: -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=4, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=16, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:57:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=32, -[default0]:07/02/2024 23:57:22 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=8, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=0))], -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32')), -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None) -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config: -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: 
eos_token_id=2, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu', -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True, -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50272) -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model.. -[default0]:07/02/2024 23:57:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks... -[default6]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. 
Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2119.44MiB) -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator -[default2]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-192]: No checkpoint path provided. -[default6]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-192]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. 
Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-192]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-192]: No checkpoint path provided. -[default7]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/02/2024 23:57:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-192]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=1|ip-26-0-163-226]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=4|ip-26-0-163-226]: No checkpoint path provided. -[default2]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=2|ip-26-0-163-226]: No checkpoint path provided. -[default7]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=7|ip-26-0-163-226]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=5|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=3|ip-26-0-163-226]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided. -[default6]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/02/2024 23:57:41 [INFO|DP=1|PP=0|TP=6|ip-26-0-163-226]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=3|ip-26-0-168-238]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=0|ip-26-0-168-238]: No checkpoint path provided. -[default2]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=2|ip-26-0-168-238]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=1|ip-26-0-168-238]: No checkpoint path provided. -[default6]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=6|ip-26-0-168-238]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=4|ip-26-0-168-238]: No checkpoint path provided. 
-[default7]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=15|ip-26-0-169-86]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=8|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=14|ip-26-0-169-86]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=12|ip-26-0-169-86]: No checkpoint path provided. -[default2]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=10|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=13|ip-26-0-169-86]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=9|ip-26-0-169-86]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=5|ip-26-0-168-238]: No checkpoint path provided. -[default7]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=7|ip-26-0-168-238]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=2|PP=0|TP=11|ip-26-0-169-86]: No checkpoint path provided. -[default6]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=14|ip-26-0-172-73]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=8|ip-26-0-172-73]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=9|ip-26-0-172-73]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=12|ip-26-0-172-73]: No checkpoint path provided. -[default2]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=10|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=13|ip-26-0-172-73]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=11|ip-26-0-172-73]: No checkpoint path provided. -[default7]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=15|ip-26-0-172-73]: No checkpoint path provided. -[default5]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=5|ip-26-0-172-57]: No checkpoint path provided. -[default6]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=6|ip-26-0-172-57]: No checkpoint path provided. -[default1]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=1|ip-26-0-172-57]: No checkpoint path provided. -[default0]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=0|ip-26-0-172-57]: No checkpoint path provided. -[default7]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=7|ip-26-0-172-57]: No checkpoint path provided. -[default2]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=2|ip-26-0-172-57]: No checkpoint path provided. -[default4]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=4|ip-26-0-172-57]: No checkpoint path provided. -[default3]:07/02/2024 23:57:41 [INFO|DP=3|PP=0|TP=3|ip-26-0-172-57]: No checkpoint path provided. 
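The per-rank parameter and memory figures logged above can be sanity-checked against the config: 1.11G total parameters split over tp=16 gives roughly 69.4M per tensor-parallel rank, and at 2 bytes per bfloat16 parameter that is about 132 MiB of weights, consistent with the logged 132.46MiB. A minimal back-of-the-envelope check in shell arithmetic (variable names are illustrative; the numbers are copied from the log above):

total_params=1110000000   # "Total number of parameters: 1.11G (2119.44MiB)"
tp=16                     # tensor-parallel degree from the config
bytes_per_param=2         # bfloat16 weights
local_params=$((total_params / tp))                                  # ~69.4M per TP rank
local_weight_mib=$((local_params * bytes_per_param / 1024 / 1024))   # ~132 MiB
echo "per-rank params: ${local_params}, per-rank weight memory: ${local_weight_mib} MiB"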
-[default0]:07/02/2024 23:57:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/02/2024 23:57:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/02/2024 23:57:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/02/2024 23:57:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/02/2024 23:57:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/02/2024 23:57:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/02/2024 23:57:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/02/2024 23:57:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library -[default0]:07/02/2024 23:57:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/02/2024 23:57:44 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages -[default0]:07/02/2024 23:57:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1 -[default0]:07/02/2024 23:57:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: -[default0]:07/02/2024 23:57:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-02 23:57:46.832407 | mbs: 32 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/02/2024 23:57:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/02/2024 23:57:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB -[default2]:07/02/2024 23:57:46 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:57:46 [WARNING|DP=3|PP=0|TP=2|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=14|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. 
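The [Start training] line and the ZeRO sharding report above follow directly from the parallelism and token settings in this run's config.yaml: dp=4, tp=16, pp=1 covers the 64 GPUs, the global batch size is dp x mbs x grad_accum, and ZeRO stage 1 splits the optimizer states of the 69.4M local parameters evenly across the 4 data-parallel replicas. Illustrative arithmetic (values copied from the config and log above):

dp=4; tp=16; pp=1
mbs=32; grad_accum=8; seq_len=4096
echo "world size:        $((dp * tp * pp)) GPUs"               # 64, matching 64_GPUS
echo "global batch size: $((dp * mbs * grad_accum)) samples"   # 1024, as logged
echo "tokens per step:   $((dp * mbs * grad_accum * seq_len))" # 4,194,304
# ZeRO stage 1: optimizer states are sharded across the dp=4 replicas,
# hence "DP Rank i has 17.4M out of 69.4M (25.00%) params' optimizer states".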
-[default6]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:57:46 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:46 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:46 [WARNING|DP=3|PP=0|TP=8|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=9|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=13|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:57:46 [WARNING|DP=3|PP=0|TP=10|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=12|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:57:46 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=7|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:57:46 [WARNING|DP=1|PP=0|TP=1|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=15|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=2|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=4|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=5|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=1|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=2|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:46 [WARNING|DP=2|PP=0|TP=0|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:57:46 [WARNING|DP=2|PP=0|TP=3|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=6|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=3|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:46 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:46 [WARNING|DP=2|PP=0|TP=4|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=8|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=15|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:46 [WARNING|DP=3|PP=0|TP=5|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=1|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=0|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=7|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-192]: Repo card metadata block was not found. 
Setting CardData to empty. -[default1]:07/02/2024 23:57:46 [WARNING|DP=2|PP=0|TP=9|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:46 [WARNING|DP=2|PP=0|TP=7|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:57:46 [WARNING|DP=2|PP=0|TP=10|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=8|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:46 [WARNING|DP=2|PP=0|TP=5|ip-26-0-168-238]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=14|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=12|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=13|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=3|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=4|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=11|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=10|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/02/2024 23:57:47 [WARNING|DP=2|PP=0|TP=11|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=14|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/02/2024 23:57:46 [WARNING|DP=1|PP=0|TP=15|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=12|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=13|ip-26-0-165-24]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/02/2024 23:57:47 [WARNING|DP=1|PP=0|TP=6|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. 
Setting CardData to empty.
-[default3]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:07/02/2024 23:57:47 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=11|ip-26-0-172-73]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:07/02/2024 23:57:47 [WARNING|DP=3|PP=0|TP=6|ip-26-0-172-57]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions
-[default7]: warnings.warn(
-[default0]:07/02/2024 23:58:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 569.13MiB. Peak allocated 41921.44MiB. Peak reserved: 42770.00MiB
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated.
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:07/02/2024 23:58:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 19.1K | tokens_per_sec: 220K | tokens_per_sec_per_gpu: 3.44K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 0.0001 | model_tflops_per_gpu: 31.2 | hardware_tflops_per_gpu: 31.2 | grad_norm: 11.5 | cuda_memory_allocated: 736M | cuda_max_memory_reserved: 44.9G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/02/2024 23:58:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 999.71MiB. Peak reserved: 42846.00MiB -[default0]:07/02/2024 23:58:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42890.00MiB -[default0]:07/02/2024 23:58:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 5.26K | tokens_per_sec: 798K | tokens_per_sec_per_gpu: 12.5K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.53e-05 | model_tflops_per_gpu: 113 | hardware_tflops_per_gpu: 113 | grad_norm: 11.6 | cuda_memory_allocated: 736M | cuda_max_memory_reserved: 45G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/02/2024 23:58:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 999.72MiB. Peak reserved: 42890.00MiB -[default0]:07/02/2024 23:58:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:58:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 5.18K | tokens_per_sec: 810K | tokens_per_sec_per_gpu: 12.7K | global_batch_size: 1.02K | lm_loss: 11.9 | lr: 9.05e-05 | model_tflops_per_gpu: 115 | hardware_tflops_per_gpu: 115 | grad_norm: 122 | cuda_memory_allocated: 736M | cuda_max_memory_reserved: 45G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/02/2024 23:58:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 999.72MiB. Peak reserved: 42892.00MiB -[default0]:STAGE:2024-07-02 23:58:16 1104448:1104448 ActivityProfilerController.cpp:314] Completed Stage: Warm Up -[default0]:07/02/2024 23:58:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. 
Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:58:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 5.17K | tokens_per_sec: 811K | tokens_per_sec_per_gpu: 12.7K | global_batch_size: 1.02K | lm_loss: 12.3 | lr: 8.58e-05 | model_tflops_per_gpu: 115 | hardware_tflops_per_gpu: 115 | grad_norm: 18.3 | cuda_memory_allocated: 736M | cuda_max_memory_reserved: 45G | hd_total_memory_tb: 312G | hd_used_memory_tb: 74G | hd_free_memory_tb: 238G -[default0]:07/02/2024 23:58:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 999.72MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:58:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 5.2K | tokens_per_sec: 806K | tokens_per_sec_per_gpu: 12.6K | global_batch_size: 1.02K | lm_loss: 11.2 | lr: 8.11e-05 | model_tflops_per_gpu: 114 | hardware_tflops_per_gpu: 114 | grad_norm: 29 -[default0]:07/02/2024 23:58:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:58:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 5.32K | tokens_per_sec: 788K | tokens_per_sec_per_gpu: 12.3K | global_batch_size: 1.02K | lm_loss: 10.2 | lr: 7.63e-05 | model_tflops_per_gpu: 112 | hardware_tflops_per_gpu: 112 | grad_norm: 10.4 -[default0]:STAGE:2024-07-02 23:58:36 1104448:1104448 ActivityProfilerController.cpp:320] Completed Stage: Collection -[default0]:STAGE:2024-07-02 23:58:37 1104448:1104448 ActivityProfilerController.cpp:324] Completed Stage: Post Processing -[default0]:07/02/2024 23:59:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 5.09K | tokens_per_sec: 825K | tokens_per_sec_per_gpu: 12.9K | global_batch_size: 1.02K | lm_loss: 9.78 | lr: 7.16e-05 | model_tflops_per_gpu: 117 | hardware_tflops_per_gpu: 117 | grad_norm: 9.29 -[default0]:07/02/2024 23:59:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 5.07K | tokens_per_sec: 827K | tokens_per_sec_per_gpu: 12.9K | global_batch_size: 1.02K | lm_loss: 10 | lr: 6.68e-05 | model_tflops_per_gpu: 117 | hardware_tflops_per_gpu: 117 | grad_norm: 32.7 -[default0]:07/02/2024 23:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 5.08K | tokens_per_sec: 825K | tokens_per_sec_per_gpu: 12.9K | global_batch_size: 1.02K | lm_loss: 9.28 | lr: 6.21e-05 | model_tflops_per_gpu: 117 | hardware_tflops_per_gpu: 117 | grad_norm: 9.83 -[default0]:07/02/2024 23:59:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. 
Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 5.06K | tokens_per_sec: 829K | tokens_per_sec_per_gpu: 12.9K | global_batch_size: 1.02K | lm_loss: 9.05 | lr: 5.74e-05 | model_tflops_per_gpu: 117 | hardware_tflops_per_gpu: 117 | grad_norm: 7.95 -[default0]:07/02/2024 23:59:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 5.05K | tokens_per_sec: 830K | tokens_per_sec_per_gpu: 13K | global_batch_size: 1.02K | lm_loss: 8.8 | lr: 5.26e-05 | model_tflops_per_gpu: 118 | hardware_tflops_per_gpu: 118 | grad_norm: 6.87 -[default0]:07/02/2024 23:59:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 5.06K | tokens_per_sec: 828K | tokens_per_sec_per_gpu: 12.9K | global_batch_size: 1.02K | lm_loss: 8.53 | lr: 4.79e-05 | model_tflops_per_gpu: 117 | hardware_tflops_per_gpu: 117 | grad_norm: 6.54 -[default0]:07/02/2024 23:59:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 5.06K | tokens_per_sec: 829K | tokens_per_sec_per_gpu: 13K | global_batch_size: 1.02K | lm_loss: 8.26 | lr: 4.32e-05 | model_tflops_per_gpu: 118 | hardware_tflops_per_gpu: 118 | grad_norm: 6.14 -[default0]:07/02/2024 23:59:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/02/2024 23:59:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 5.07K | tokens_per_sec: 828K | tokens_per_sec_per_gpu: 12.9K | global_batch_size: 1.02K | lm_loss: 8.04 | lr: 3.84e-05 | model_tflops_per_gpu: 117 | hardware_tflops_per_gpu: 117 | grad_norm: 5.51 -[default0]:07/02/2024 23:59:55 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/03/2024 00:00:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 5.07K | tokens_per_sec: 828K | tokens_per_sec_per_gpu: 12.9K | global_batch_size: 1.02K | lm_loss: 7.94 | lr: 3.37e-05 | model_tflops_per_gpu: 117 | hardware_tflops_per_gpu: 117 | grad_norm: 6.08 -[default0]:07/03/2024 00:00:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/03/2024 00:00:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 5.05K | tokens_per_sec: 830K | tokens_per_sec_per_gpu: 13K | global_batch_size: 1.02K | lm_loss: 7.88 | lr: 2.89e-05 | model_tflops_per_gpu: 118 | hardware_tflops_per_gpu: 118 | grad_norm: 6.9 -[default0]:07/03/2024 00:00:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. 
Peak reserved: 42892.00MiB -[default0]:07/03/2024 00:00:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 5.05K | tokens_per_sec: 831K | tokens_per_sec_per_gpu: 13K | global_batch_size: 1.02K | lm_loss: 7.74 | lr: 2.42e-05 | model_tflops_per_gpu: 118 | hardware_tflops_per_gpu: 118 | grad_norm: 5.79 -[default0]:07/03/2024 00:00:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/03/2024 00:00:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 5.05K | tokens_per_sec: 831K | tokens_per_sec_per_gpu: 13K | global_batch_size: 1.02K | lm_loss: 7.61 | lr: 1.95e-05 | model_tflops_per_gpu: 118 | hardware_tflops_per_gpu: 118 | grad_norm: 4.59 -[default0]:07/03/2024 00:00:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/03/2024 00:00:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 5.06K | tokens_per_sec: 830K | tokens_per_sec_per_gpu: 13K | global_batch_size: 1.02K | lm_loss: 7.52 | lr: 1.47e-05 | model_tflops_per_gpu: 118 | hardware_tflops_per_gpu: 118 | grad_norm: 4.5 -[default0]:07/03/2024 00:00:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 701.67MiB. Peak allocated 42053.98MiB. Peak reserved: 42892.00MiB -[default0]:07/03/2024 00:00:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 5.04K | tokens_per_sec: 832K | tokens_per_sec_per_gpu: 13K | global_batch_size: 1.02K | lm_loss: 7.46 | lr: 1e-05 | model_tflops_per_gpu: 118 | hardware_tflops_per_gpu: 118 | grad_norm: 4.57 -Saved 1 csv files over 1 completed logs -Processing file: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/profiler/ip-26-0-160-192_1104448.1719964747268413011.pt.trace.json -Results written to /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-32/profiler.csv -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. - ip-26-0-160-192_1104448.1719964747268413011.pt.trace.json: 0%| | 0.00/1.17G [00:00 $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/config.yaml deleted file mode 100644 index 16b4e74fd18b10c8aee142eeb4572c730eb6d484..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 64 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 4 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/log.out deleted file mode 100644 index 62d9665332bed6b4c83c931af5a994895bd515c1..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/log.out +++ /dev/null @@ -1,707 +0,0 @@ -======================== -START TIME: Wed Jul 3 10:28:41 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
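A quick cross-check of the run topology (a minimal sketch, not part of the original scripts; the values are copied from the config.yaml and bench.slurm above):

# dp * tp * pp from the parallelism block must equal nnodes * nproc_per_node from the torchrun launcher.
dp, tp, pp = 4, 16, 1            # parallelism block in config.yaml
nnodes, nproc_per_node = 8, 8    # torchrun flags in bench.slurm
assert dp * tp * pp == nnodes * nproc_per_node == 64   # consistent with the 64_GPUS directory name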
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 10:28:47.279000 140020092532544 torch/distributed/run.py:757] -W0703 10:28:47.279000 140020092532544 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.279000 140020092532544 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:28:47.279000 140020092532544 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.285000 140418310711104 torch/distributed/run.py:757] -W0703 10:28:47.285000 140418310711104 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.285000 140418310711104 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:28:47.285000 140418310711104 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.288000 139946479441728 torch/distributed/run.py:757] -W0703 10:28:47.288000 139946479441728 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.288000 139946479441728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:28:47.288000 139946479441728 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.343000 139914602121024 torch/distributed/run.py:757] -W0703 10:28:47.343000 139914602121024 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.343000 139914602121024 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:28:47.343000 139914602121024 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.569000 139781451552576 torch/distributed/run.py:757] -W0703 10:28:47.569000 139781451552576 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.569000 139781451552576 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 10:28:47.569000 139781451552576 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.674000 140528482002752 torch/distributed/run.py:757] -W0703 10:28:47.674000 140528482002752 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.674000 140528482002752 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:28:47.674000 140528482002752 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.707000 140004216411968 torch/distributed/run.py:757] -W0703 10:28:47.707000 140004216411968 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.707000 140004216411968 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:28:47.707000 140004216411968 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.845000 140383048894272 torch/distributed/run.py:757] -W0703 10:28:47.845000 140383048894272 torch/distributed/run.py:757] ***************************************** -W0703 10:28:47.845000 140383048894272 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 10:28:47.845000 140383048894272 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 10:29:12 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Config: -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: run='%date_%jobid', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: seed=42, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: step=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: consumed_train_samples=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: benchmark_csv_path=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: ignore_sanity_checks=True), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: parallelism=ParallelismArgs(dp=4, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pp=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tp=16, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pp_engine=, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tp_mode=, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tp_linear_async_communication=False, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: expert_parallel_size=1), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 10:29:12 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: eos_token_id=2, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_act='silu', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_size=2048, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: initializer_range=0.02, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: intermediate_size=4096, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: is_llama_config=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: max_position_embeddings=4096, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_attention_heads=32, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_hidden_layers=24, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_key_value_heads=32, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pad_token_id=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pretraining_tp=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rms_norm_eps=1e-05, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_scaling=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_theta=10000.0, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tie_word_embeddings=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: use_cache=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: vocab_size=50272), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: dtype=torch.bfloat16, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokenizer_revision=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokenizer_max_length=None), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: checkpoint_interval=100000, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: save_initial_state=False, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: resume_checkpoint_path=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: log_level_replica='info', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration_step_info_interval=1), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: train_steps=20, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: micro_batch_size=4, -[default0]:07/03/2024 10:29:12 
[INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: batch_accumulation_per_replica=64, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: val_check_interval=-1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: limit_val_batches=0, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: limit_test_batches=0), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: adam_beta1=0.9, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: adam_beta2=0.95, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: torch_adam_is_fused=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: name='adamW'), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: zero_stage=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: weight_decay=0.01, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: clip_grad=1.0, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_warmup_steps=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_warmup_style='linear', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_decay_style='linear', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_decay_steps=19, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lr_decay_starting_step=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: min_decay_lr=1e-05)), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: start_training_step=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hf_dataset_splits='train', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hf_dataset_config_name=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: dataset_overwrite_cache=False, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: text_column_name='text'), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: seed=42, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_loading_workers=0))], -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4')), -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: lighteval=None) -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Model Config: -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: 
eos_token_id=2, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_act='silu', -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: hidden_size=2048, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: initializer_range=0.02, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: intermediate_size=4096, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: is_llama_config=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: max_position_embeddings=4096, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_attention_heads=32, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_hidden_layers=24, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: num_key_value_heads=32, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pad_token_id=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: pretraining_tp=1, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rms_norm_eps=1e-05, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_scaling=None, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: rope_theta=10000.0, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: tie_word_embeddings=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: use_cache=True, -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: vocab_size=50272) -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Building model.. -[default0]:07/03/2024 10:29:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Setting PP block ranks... -[default3]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-178]: No checkpoint path provided. -[default5]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-178]: No checkpoint path provided. -[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-178]: No checkpoint path provided. -[default4]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-178]: No checkpoint path provided. 
-[default6]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-178]: No checkpoint path provided. -[default1]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-178]: No checkpoint path provided. -[default2]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-178]: No checkpoint path provided. -[default7]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=7|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=5|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=6|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Total number of parameters: 1.11G (2119.44MiB) -[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: No checkpoint path provided. 
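The tensor-parallel sharding figures reported above can be sanity-checked directly (a small sketch, not part of the benchmark code; the numbers are taken from the log lines in this file):

tp = 16
padded_vocab = 50257 + 15                 # "[Vocab Size Padding]" line: 15 dummy tokens added
assert padded_vocab == 50272 and padded_vocab % tp == 0   # 3142 embedding rows per TP rank
total_mib, local_mib = 2119.44, 132.46    # total vs. per-rank parameter sizes logged above
assert abs(total_mib / tp - local_mib) < 0.01             # 2119.44 / 16 = 132.465 MiB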
-[default0]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Parametrizing model parameters using StandardParametrizator
-[default1]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB)
-[default1]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB
-[default1]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-153]: No checkpoint path provided.
-[default2]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB)
-[default2]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB
-[default2]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-153]: No checkpoint path provided.
-[default4]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB)
-[default4]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB
-[default3]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-153]: Local number of parameters: 69.4M (132.46MiB)
-[default3]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-153]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB
-[default4]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=4|ip-26-0-161-153]: No checkpoint path provided.
-[default3]:07/03/2024 10:29:29 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-153]: No checkpoint path provided.
-[default0]:07/03/2024 10:29:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 10:29:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 10:29:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 10:29:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 10:29:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 10:29:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 10:29:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 10:29:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Using `datasets` library
-[default0]:07/03/2024 10:29:33 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:07/03/2024 10:29:33 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 10:29:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 10:29:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 10:29:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]:
-[default0]:07/03/2024 10:29:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: [Start training] datetime: 2024-07-03 10:29:35.711845 | mbs: 4 | grad_accum: 64 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 10:29:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 10:29:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB
-[default5]:07/03/2024 10:29:35 [WARNING|DP=1|PP=0|TP=13|ip-26-0-163-43]: Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 10:29:35 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
-[default6]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 10:29:35 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:07/03/2024 10:29:35 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 10:29:35 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default2]:07/03/2024 10:29:46 [WARNING|DP=2|PP=0|TP=2|ip-26-0-171-102]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default2]:07/03/2024 10:29:46 [WARNING|DP=2|PP=0|TP=2|ip-26-0-171-102]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior.
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. 
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) 
-[default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). 
If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:07/03/2024 10:29:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 567.19MiB. Peak allocated 5736.28MiB. Peak reserved: 6118.00MiB -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
-[default0]:07/03/2024 10:30:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 27.1K | tokens_per_sec: 155K | tokens_per_sec_per_gpu: 2.42K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 0.0001 | model_tflops_per_gpu: 22 | hardware_tflops_per_gpu: 22 | grad_norm: 11.5 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 6.5G | hd_total_memory_tb: 312G | hd_used_memory_tb: 69.3G | hd_free_memory_tb: 243G
-[default0]:07/03/2024 10:30:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 997.77MiB. Peak reserved: 6200.00MiB
-[default0]:07/03/2024 10:30:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.75MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:30:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 10.1K | tokens_per_sec: 414K | tokens_per_sec_per_gpu: 6.47K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.53e-05 | model_tflops_per_gpu: 58.7 | hardware_tflops_per_gpu: 58.7 | grad_norm: 11.6 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 6.54G | hd_total_memory_tb: 312G | hd_used_memory_tb: 69.3G | hd_free_memory_tb: 243G
-[default0]:07/03/2024 10:30:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 997.80MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:30:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.75MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:30:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 10.3K | tokens_per_sec: 406K | tokens_per_sec_per_gpu: 6.35K | global_batch_size: 1.02K | lm_loss: 11.9 | lr: 9.05e-05 | model_tflops_per_gpu: 57.6 | hardware_tflops_per_gpu: 57.6 | grad_norm: 122 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 6.54G | hd_total_memory_tb: 312G | hd_used_memory_tb: 69.3G | hd_free_memory_tb: 243G
-[default0]:07/03/2024 10:30:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 997.80MiB. Peak reserved: 6238.00MiB
-[default0]:STAGE:2024-07-03 10:30:23 1502820:1502820 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
-[default0]:07/03/2024 10:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.75MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 13.6K | tokens_per_sec: 308K | tokens_per_sec_per_gpu: 4.81K | global_batch_size: 1.02K | lm_loss: 12.3 | lr: 8.58e-05 | model_tflops_per_gpu: 43.7 | hardware_tflops_per_gpu: 43.7 | grad_norm: 18.3 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 6.54G | hd_total_memory_tb: 312G | hd_used_memory_tb: 69.3G | hd_free_memory_tb: 243G
-[default0]:07/03/2024 10:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 997.80MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:30:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 13.7K | tokens_per_sec: 306K | tokens_per_sec_per_gpu: 4.78K | global_batch_size: 1.02K | lm_loss: 11.2 | lr: 8.11e-05 | model_tflops_per_gpu: 43.4 | hardware_tflops_per_gpu: 43.4 | grad_norm: 29
-[default0]:07/03/2024 10:30:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:31:04 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 13.7K | tokens_per_sec: 305K | tokens_per_sec_per_gpu: 4.77K | global_batch_size: 1.02K | lm_loss: 10.2 | lr: 7.63e-05 | model_tflops_per_gpu: 43.3 | hardware_tflops_per_gpu: 43.3 | grad_norm: 10.4
-[default0]:STAGE:2024-07-03 10:31:39 1502820:1502820 ActivityProfilerController.cpp:320] Completed Stage: Collection
-[default0]:STAGE:2024-07-03 10:31:43 1502820:1502820 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
-[default0]:07/03/2024 10:36:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:36:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 9.94K | tokens_per_sec: 422K | tokens_per_sec_per_gpu: 6.59K | global_batch_size: 1.02K | lm_loss: 9.78 | lr: 7.16e-05 | model_tflops_per_gpu: 59.8 | hardware_tflops_per_gpu: 59.8 | grad_norm: 9.29
-[default0]:07/03/2024 10:36:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:36:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 10.1K | tokens_per_sec: 417K | tokens_per_sec_per_gpu: 6.51K | global_batch_size: 1.02K | lm_loss: 10 | lr: 6.68e-05 | model_tflops_per_gpu: 59.1 | hardware_tflops_per_gpu: 59.1 | grad_norm: 32.7
-[default0]:07/03/2024 10:36:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:36:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 10.3K | tokens_per_sec: 407K | tokens_per_sec_per_gpu: 6.36K | global_batch_size: 1.02K | lm_loss: 9.28 | lr: 6.21e-05 | model_tflops_per_gpu: 57.7 | hardware_tflops_per_gpu: 57.7 | grad_norm: 9.81
-[default0]:07/03/2024 10:36:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:37:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 9.9K | tokens_per_sec: 424K | tokens_per_sec_per_gpu: 6.62K | global_batch_size: 1.02K | lm_loss: 9.05 | lr: 5.74e-05 | model_tflops_per_gpu: 60.1 | hardware_tflops_per_gpu: 60.1 | grad_norm: 7.94
-[default0]:07/03/2024 10:37:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:37:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 9.86K | tokens_per_sec: 425K | tokens_per_sec_per_gpu: 6.64K | global_batch_size: 1.02K | lm_loss: 8.8 | lr: 5.26e-05 | model_tflops_per_gpu: 60.3 | hardware_tflops_per_gpu: 60.3 | grad_norm: 6.87
-[default0]:07/03/2024 10:37:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:37:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 10.1K | tokens_per_sec: 413K | tokens_per_sec_per_gpu: 6.46K | global_batch_size: 1.02K | lm_loss: 8.53 | lr: 4.79e-05 | model_tflops_per_gpu: 58.6 | hardware_tflops_per_gpu: 58.6 | grad_norm: 6.54
-[default0]:07/03/2024 10:37:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:37:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 10.3K | tokens_per_sec: 408K | tokens_per_sec_per_gpu: 6.38K | global_batch_size: 1.02K | lm_loss: 8.26 | lr: 4.32e-05 | model_tflops_per_gpu: 57.9 | hardware_tflops_per_gpu: 57.9 | grad_norm: 6.14
-[default0]:07/03/2024 10:37:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:37:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 9.9K | tokens_per_sec: 424K | tokens_per_sec_per_gpu: 6.62K | global_batch_size: 1.02K | lm_loss: 8.04 | lr: 3.84e-05 | model_tflops_per_gpu: 60 | hardware_tflops_per_gpu: 60 | grad_norm: 5.51
-[default0]:07/03/2024 10:37:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:37:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 9.96K | tokens_per_sec: 421K | tokens_per_sec_per_gpu: 6.58K | global_batch_size: 1.02K | lm_loss: 7.94 | lr: 3.37e-05 | model_tflops_per_gpu: 59.7 | hardware_tflops_per_gpu: 59.7 | grad_norm: 6.08
-[default0]:07/03/2024 10:37:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:38:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 9.92K | tokens_per_sec: 423K | tokens_per_sec_per_gpu: 6.6K | global_batch_size: 1.02K | lm_loss: 7.88 | lr: 2.89e-05 | model_tflops_per_gpu: 59.9 | hardware_tflops_per_gpu: 59.9 | grad_norm: 6.9
-[default0]:07/03/2024 10:38:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:38:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 9.86K | tokens_per_sec: 425K | tokens_per_sec_per_gpu: 6.64K | global_batch_size: 1.02K | lm_loss: 7.74 | lr: 2.42e-05 | model_tflops_per_gpu: 60.3 | hardware_tflops_per_gpu: 60.3 | grad_norm: 5.78
-[default0]:07/03/2024 10:38:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:38:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 9.9K | tokens_per_sec: 424K | tokens_per_sec_per_gpu: 6.62K | global_batch_size: 1.02K | lm_loss: 7.61 | lr: 1.95e-05 | model_tflops_per_gpu: 60.1 | hardware_tflops_per_gpu: 60.1 | grad_norm: 4.59
-[default0]:07/03/2024 10:38:28 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:38:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 9.94K | tokens_per_sec: 422K | tokens_per_sec_per_gpu: 6.6K | global_batch_size: 1.02K | lm_loss: 7.52 | lr: 1.47e-05 | model_tflops_per_gpu: 59.8 | hardware_tflops_per_gpu: 59.8 | grad_norm: 4.5
-[default0]:07/03/2024 10:38:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: Memory usage: 699.72MiB. Peak allocated 5868.85MiB. Peak reserved: 6238.00MiB
-[default0]:07/03/2024 10:38:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-153]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 11K | tokens_per_sec: 380K | tokens_per_sec_per_gpu: 5.94K | global_batch_size: 1.02K | lm_loss: 7.46 | lr: 1e-05 | model_tflops_per_gpu: 53.9 | hardware_tflops_per_gpu: 53.9 | grad_norm: 4.56
-Saved 1 csv files over 1 completed logs
-Processing file: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/profiler/ip-26-0-161-153_1502820.1720002925641726521.pt.trace.json
-Results written to /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-4/profiler.csv
-Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
- ip-26-0-161-153_1502820.1720002925641726521.pt.trace.json: 0%| | 0.00/9.02G [00:00 $status_file
- break
- fi
- sleep 10
- done
-}
-
-# Misc initializations.
-echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/config.yaml deleted file mode 100644 index c6fdd842138896b51dd59fea08e7c5ea57dfe40c..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 4 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 64 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/log.out deleted file mode 100644 index 96f2fab4a62215de6de8f5fc158937430ab6c27f..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/log.out +++ /dev/null @@ -1,4021 +0,0 @@ -======================== -START TIME: Wed Jul 3 01:00:24 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 01:00:27.238000 140015942473536 torch/distributed/run.py:757] -W0703 01:00:27.238000 140015942473536 torch/distributed/run.py:757] ***************************************** -W0703 01:00:27.238000 140015942473536 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:00:27.238000 140015942473536 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.222000 140709557507904 torch/distributed/run.py:757] -W0703 01:00:29.222000 140709557507904 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.222000 140709557507904 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:00:29.222000 140709557507904 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.222000 139854410950464 torch/distributed/run.py:757] -W0703 01:00:29.222000 139854410950464 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.222000 139854410950464 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:00:29.222000 139854410950464 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.233000 140540211869504 torch/distributed/run.py:757] -W0703 01:00:29.233000 140540211869504 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.233000 140540211869504 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:00:29.233000 140540211869504 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.237000 140092964255552 torch/distributed/run.py:757] -W0703 01:00:29.237000 140092964255552 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.237000 140092964255552 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 01:00:29.237000 140092964255552 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.342000 140574819604288 torch/distributed/run.py:757] -W0703 01:00:29.342000 140574819604288 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.342000 140574819604288 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:00:29.342000 140574819604288 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.369000 140324900312896 torch/distributed/run.py:757] -W0703 01:00:29.369000 140324900312896 torch/distributed/run.py:757] ***************************************** -W0703 01:00:29.369000 140324900312896 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:00:29.369000 140324900312896 torch/distributed/run.py:757] ***************************************** -W0703 01:00:30.315000 140432057993024 torch/distributed/run.py:757] -W0703 01:00:30.315000 140432057993024 torch/distributed/run.py:757] ***************************************** -W0703 01:00:30.315000 140432057993024 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 01:00:30.315000 140432057993024 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 01:00:54 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config: -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=4, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=16, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:00:54 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=64, -[default0]:07/03/2024 01:00:54 
[INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=4, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))], -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64')), -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None) -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config: -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: 
eos_token_id=2, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu', -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True, -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50272) -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model.. -[default0]:07/03/2024 01:00:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks... -[default5]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=13|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=9|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-153]: No checkpoint path provided. -[default5]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=5|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=4|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=6|ip-26-0-161-153]: No checkpoint path provided. -[default0]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=8|ip-26-0-161-78]: No checkpoint path provided. -[default1]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-153]: No checkpoint path provided. -[default6]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=14|ip-26-0-161-78]: No checkpoint path provided. -[default7]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=7|ip-26-0-161-153]: No checkpoint path provided. -[default4]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=12|ip-26-0-161-78]: No checkpoint path provided. -[default3]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=11|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=10|ip-26-0-161-78]: No checkpoint path provided. -[default2]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-153]: No checkpoint path provided. -[default7]:07/03/2024 01:01:11 [INFO|DP=1|PP=0|TP=15|ip-26-0-161-78]: No checkpoint path provided. -[default4]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=12|ip-26-0-171-88]: No checkpoint path provided. 
-[default0]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=8|ip-26-0-171-88]: No checkpoint path provided. -[default0]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=0|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=10|ip-26-0-171-88]: No checkpoint path provided. -[default3]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=11|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=9|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=14|ip-26-0-171-88]: No checkpoint path provided. -[default1]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=1|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=13|ip-26-0-171-88]: No checkpoint path provided. -[default7]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=15|ip-26-0-171-88]: No checkpoint path provided. -[default6]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=6|ip-26-0-171-62]: No checkpoint path provided. -[default5]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=5|ip-26-0-171-62]: No checkpoint path provided. -[default2]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=2|ip-26-0-171-62]: No checkpoint path provided. -[default7]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=7|ip-26-0-171-62]: No checkpoint path provided. -[default3]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=3|ip-26-0-171-62]: No checkpoint path provided. -[default4]:07/03/2024 01:01:11 [INFO|DP=3|PP=0|TP=4|ip-26-0-171-62]: No checkpoint path provided. -[default1]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=15|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=10|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=9|ip-26-0-171-102]: No checkpoint path provided. -[default0]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=8|ip-26-0-171-102]: No checkpoint path provided. -[default2]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=11|ip-26-0-171-102]: No checkpoint path provided. -[default6]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=14|ip-26-0-171-102]: No checkpoint path provided. -[default4]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=12|ip-26-0-171-102]: No checkpoint path provided. -[default5]:07/03/2024 01:01:11 [INFO|DP=2|PP=0|TP=13|ip-26-0-171-102]: No checkpoint path provided. -[default1]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. 
Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=9|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=13|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=10|ip-26-0-161-103]: No checkpoint path provided. -[default3]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=11|ip-26-0-161-103]: No checkpoint path provided. -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=8|ip-26-0-161-103]: No checkpoint path provided. -[default4]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=12|ip-26-0-161-103]: No checkpoint path provided. -[default5]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=5|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=7|ip-26-0-160-225]: No checkpoint path provided. -[default6]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=6|ip-26-0-160-225]: No checkpoint path provided. 
-[default3]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.11G (2119.44MiB) -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided. -[default0]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator -[default3]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided. -[default7]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=15|ip-26-0-161-103]: No checkpoint path provided. -[default6]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=14|ip-26-0-161-103]: No checkpoint path provided. -[default2]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided. -[default4]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=4|ip-26-0-160-225]: No checkpoint path provided. -[default1]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 01:01:12 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided. 
-[default0]:07/03/2024 01:01:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate -[default0]:07/03/2024 01:01:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank: -[default0]:07/03/2024 01:01:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 01:01:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 01:01:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 01:01:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states -[default0]:07/03/2024 01:01:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples -[default0]:07/03/2024 01:01:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library -[default0]:07/03/2024 01:01:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4') -[default0]:07/03/2024 01:01:15 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:01:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages -[default0]:07/03/2024 01:01:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1 -[default0]:07/03/2024 01:01:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: -[default0]:07/03/2024 01:01:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 01:01:17.289255 | mbs: 64 | grad_accum: 4 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0 -[default0]:07/03/2024 01:01:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps -[default0]:07/03/2024 01:01:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB -[default5]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=13|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=14|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=8|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=6|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=5|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=15|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=9|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. 
-[default0]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=8|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=15|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=8|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=14|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=11|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=11|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=9|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=5|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=7|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=15|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=12|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=15|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=6|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=4|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. 
-[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=9|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=6|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=5|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=4|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=7|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=7|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=4|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=10|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=10|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=2|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=3|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-153]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=12|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. 
-[default5]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=13|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=11|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=10|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=9|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=10|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=12|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=14|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=6|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=14|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=13|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=13|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=2|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=5|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default7]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default7]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=7|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default6]:Repo card metadata block was not found. Setting CardData to empty. 
-[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=4|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default2]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default5]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=2|PP=0|TP=1|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:07/03/2024 01:01:17 [WARNING|DP=1|PP=0|TP=12|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default4]:Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:07/03/2024 01:01:17 [WARNING|DP=0|PP=0|TP=8|ip-26-0-161-103]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default1]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:17 [WARNING|DP=3|PP=0|TP=3|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. -[default0]:Repo card metadata block was not found. Setting CardData to empty. -[default3]:07/03/2024 01:01:18 [WARNING|DP=1|PP=0|TP=11|ip-26-0-161-78]: Repo card metadata block was not found. Setting CardData to empty. -[default3]:Repo card metadata block was not found. Setting CardData to empty. 
-[default3]:[rank35]: Traceback (most recent call last):
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
-[default3]:[rank35]: trainer.train(dataloader)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
-[default3]:[rank35]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
-[default3]:[rank35]: outputs = self.pipeline_engine.train_batch_iter(
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
-[default3]:[rank35]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
-[default3]:[rank35]: output = model(**micro_batch)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]: return self._call_impl(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]: return forward_call(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
-[default3]:[rank35]: sharded_logits = self.model(
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]: return self._call_impl(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]: return forward_call(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
-[default3]:[rank35]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default3]:[rank35]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]: return self._call_impl(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]: return forward_call(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default3]:[rank35]: output = self.pp_block(**new_kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]: return self._call_impl(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]: return forward_call(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default3]:[rank35]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default3]:[rank35]: return self._call_impl(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default3]:[rank35]: return forward_call(*args, **kwargs)
-[default3]:[rank35]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward
-[default3]:[rank35]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
-[default3]:[rank35]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 37.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.45 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[... ranks 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43 and 44 raise the same torch.cuda.OutOfMemoryError with an identical call stack; their tracebacks are interleaved in the original log. Ranks 33, 37 and 39 fail at llama.py line 563 (key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)); ranks 32, 34, 36, 38, 40, 41, 42, 43 and 44 fail at llama.py line 565 (key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()) ...]
-[... per-rank OOM details, all "Tried to allocate 128.00 MiB" on GPUs with 79.33 GiB total capacity: ranks 33/37/39: 37.94 MiB free, 79.28 GiB in use, 69.45 GiB allocated by PyTorch, 64.03 MiB reserved but unallocated; ranks 34/36: 13.94 MiB free, 79.30 GiB in use, 69.57 GiB allocated, 128.03 MiB reserved; rank 38: 77.94 MiB free, 79.24 GiB in use, 69.57 GiB allocated, 64.03 MiB reserved; ranks 41/43: 117.94 MiB free, 79.20 GiB in use, 69.57 GiB allocated, 64.03 MiB reserved; ranks 42/44: 29.94 MiB free, 79.29 GiB in use, 69.57 GiB allocated, 64.03 MiB reserved; the messages for ranks 32 and 40 are truncated in the log. Each message ends with the hint to try PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True and the link https://pytorch.org/docs/stable/notes/cuda.html#environment-variables ...]
-[... ranks 45, 46 and 47 begin the same traceback at this point; their interleaved frames continue below ...]
-[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 
278, in train_batch_iter -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: output = model(**micro_batch) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: sharded_logits = self.model( -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank47]: sharded_logits = self.model( -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank46]: sharded_logits = self.model( -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank47]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank46]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: output = self.pp_block(**new_kwargs) -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: output = self.pp_block(**new_kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default6]:[rank46]: output = self.pp_block(**new_kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank47]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank47]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank46]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank45]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank46]: return self._call_impl(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank47]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank45]: return self._call_impl(*args, **kwargs) -[default7]:[rank47]: return forward_call(*args, **kwargs) -[default5]:[rank45]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank46]: return forward_call(*args, **kwargs) -[default5]:[rank45]: return forward_call(*args, **kwargs) -[default7]:[rank47]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default5]:[rank45]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default7]:[rank47]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default7]:[rank47]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. 
GPU  has a total capacity of 79.33 GiB of which 117.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank46]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward -[default6]:[rank46]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0) -[default5]:[rank45]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default5]:[rank45]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 117.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank46]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 93.94 MiB is free. Including non-PyTorch memory, this process has 79.23 GiB memory in use. Of the allocated memory 69.45 GiB is allocated by PyTorch, and 128.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
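Every traceback in this run bottoms out in the attention forward pass of nanotron/src/nanotron/models/llama.py: most ranks fail at line 565 (the permute(...).contiguous() on the stacked key/value tensor), ranks 46 and 24 fail one line earlier at the torch.cat on line 563, and rank 26 fails at another .contiguous() on line 389. The snippet below is not the nanotron source; it is a minimal, CPU-runnable sketch with made-up shapes showing why that statement requests a fresh block of memory: permute() is a zero-copy view, but .contiguous() materializes a second full-size copy of the stacked key/value tensor, so for a moment both copies are alive.

import torch

# Illustrative shapes only; the real ones depend on the model config and the
# tensor-parallel split. Assumed layout: [seq_len, batch, kv_heads, head_dim].
key_states = torch.randn(4096, 1, 8, 128)
value_states = torch.randn(4096, 1, 8, 128)

# Same pattern as llama.py line 563: stack K and V along a new leading dim.
# torch.cat allocates a new tensor that holds both (this is where ranks 46
# and 24 ran out of memory).
key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)

# Same pattern as llama.py line 565: permute() only changes strides, but
# .contiguous() copies the data into a new buffer with the permuted layout.
# With the real shapes on the GPU, this extra copy is the 128.00 MiB request
# that the allocator could not satisfy in the tracebacks above.
key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()

print(key_value_states.shape)  # torch.Size([4096, 1, 2, 8, 128])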
-[default6]:[rank22]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default6]:[rank22]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default6]:[rank22]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 117.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default4]:[rank20]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default4]:[rank20]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default4]:[rank20]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 117.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank21]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default5]:[rank21]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default5]:[rank21]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 29.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank26]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 389, in forward
-[default2]:[rank26]: .contiguous()
-[default2]:[rank26]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU has a total capacity of 79.33 GiB of which 165.94 MiB is free. Including non-PyTorch memory, this process has 79.16 GiB memory in use. Of the allocated memory 69.26 GiB is allocated by PyTorch, and 128.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank29]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default5]:[rank29]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default5]:[rank29]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 77.94 MiB is free. Including non-PyTorch memory, this process has 79.24 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default1]:[rank17]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default1]:[rank17]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default1]:[rank17]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 29.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default2]:[rank18]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default2]:[rank18]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default2]:[rank18]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 117.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank24]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward
-[default0]:[rank24]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
-[default0]:[rank24]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU
-[default1]:[rank25]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default1]:[rank25]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default1]:[rank25]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 77.94 MiB is free. Including non-PyTorch memory, this process has 79.24 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank30]: Traceback (most recent call last): -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank30]: trainer.train(dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank30]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank30]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank30]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank30]: output = model(**micro_batch) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank30]: sharded_logits = self.model( -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank30]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank30]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank30]: output = self.pp_block(**new_kwargs) -[default6]:[rank30]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank30]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank30]: return self._call_impl(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank30]: return forward_call(*args, **kwargs) -[default6]:[rank30]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward -[default6]:[rank30]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0) -[default6]:[rank30]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 37.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.45 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank28]: Traceback (most recent call last): -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank28]: trainer.train(dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank28]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank28]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank28]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank28]: output = model(**micro_batch) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank28]: sharded_logits = self.model( -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank28]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank28]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank28]: output = self.pp_block(**new_kwargs) -[default4]:[rank28]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank28]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank28]: return self._call_impl(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank28]: return forward_call(*args, **kwargs) -[default4]:[rank28]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward -[default4]:[rank28]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0) -[default4]:[rank28]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 37.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 69.45 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_C[default0]:[rank16]: Traceback (most recent call last): -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank16]: trainer.train(dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank16]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank16]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank16]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank27]: Traceback (most recent call last): -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank27]: trainer.train(dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank27]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank27]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank27]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank27]: output = model(**micro_batch) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank27]: sharded_logits = self.model( -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[defron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank16]: output = model(**micro_batch) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -ault3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default0]:[rank16]: sharded_logits = self.model( -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, 
**kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank16]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank16]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank16][default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank27]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank27]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank16]: output = self.pp_block(**new_kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank27]: output = self.pp_block(**new_kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank27]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default/module.py", line 1541, 
in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -3]:[rank27]: return self._call_impl(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank27]: return forward_call(*args, **kwargs) -[default3]:[rank27]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default3]:[rank27]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default3]:[rank27]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 13.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 128.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Mem[default0]:[rank16]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank16]: return self._call_impl(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank16]: return forward_call(*args, **kwargs) -[default0]:[rank16]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default0]:[rank16]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default0]:[rank16]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. 
GPU -ory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank31]: Traceback (most recent call last): -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank31]: trainer.train(dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank31]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank31]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank31]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank31]: output = model(**micro_batch) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank31]: sharded_logits = self.model( -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank31]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank31]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank31]: output = self.pp_block(**new_kwargs) -[default7]:[rank31]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank31]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank31]: return self._call_impl(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank31]: return forward_call(*args, **kwargs) -[default7]:[rank31]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default7]:[rank31]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default7]:[rank31]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 77.94 MiB is free. Including non-PyTorch memory, this process has 79.24 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank19]: Traceback (most recent call last): -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank19]: trainer.train(dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank19]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank19]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank19]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank19]: output = model(**micro_batch) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank19]: sharded_logits = self.model( -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank19]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank19]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank19]: output = self.pp_block(**new_kwargs) -[default3]:[rank19]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank19]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank19]: return self._call_impl(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank19]: return forward_call(*args, **kwargs) -[default3]:[rank19]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default3]:[rank19]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default3]:[rank19]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 29.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank23]: Traceback (most recent call last): -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank23]: trainer.train(dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank23]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank23]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank23]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank23]: output = model(**micro_batch) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank23]: sharded_logits = self.model( -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank23]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank23]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank23]: output = self.pp_block(**new_kwargs) -[default7]:[rank23]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank23]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank23]: return self._call_impl(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank23]: return forward_call(*args, **kwargs) -[default7]:[rank23]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default7]:[rank23]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default7]:[rank23]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 29.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank61]: Traceback (most recent call last): -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank61]: trainer.train(dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank61]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank61]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank61]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank61]: output = model(**micro_batch) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank61]: sharded_logits = self.model( -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank61]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank61]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank61]: output = self.pp_block(**new_kwargs) -[default5]:[rank61]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank61]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank61]: output = self.o_proj(attention_output) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank61]: return self._call_impl(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank61]: return forward_call(*args, **kwargs) -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank61]: return row_linear( -[default5]:[rank61]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank61]: out = F.linear(input, weight, bias) -[default5]:[rank61]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 563.94 MiB is free. Including non-PyTorch memory, this process has 78.77 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank60]: Traceback (most recent call last): -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank60]: trainer.train(dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank60]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank60]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank60]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank60]: output = model(**micro_batch) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank60]: sharded_logits = self.model( -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank60]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank60]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank60]: output = self.pp_block(**new_kwargs) -[default4]:[rank60]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank60]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank60]: output = self.o_proj(attention_output) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank60]: return self._call_impl(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank60]: return forward_call(*args, **kwargs) -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank60]: return row_linear( -[default4]:[rank60]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank60]: out = F.linear(input, weight, bias) -[default4]:[rank60]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 475.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default1]:[rank57]: Traceback (most recent call last): -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank57]: trainer.train(dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank57]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default1]:[rank57]: outputs = self.pipeline_engine.train_batch_iter( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default1]:[rank57]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default1]:[rank57]: output = model(**micro_batch) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank57]: sharded_logits = self.model( -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default1]:[rank57]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default1]:[rank57]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default1]:[rank57]: output = self.pp_block(**new_kwargs) -[default1]:[rank57]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default1]:[rank57]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default1]:[rank57]: output = self.o_proj(attention_output) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default1]:[rank57]: return self._call_impl(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default1]:[rank57]: return forward_call(*args, **kwargs) -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default1]:[rank57]: return row_linear( -[default1]:[rank57]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default1]:[rank57]: out = F.linear(input, weight, bias) -[default1]:[rank57]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 563.94 MiB is free. Including non-PyTorch memory, this process has 78.77 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank59]: Traceback (most recent call last): -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank59]: trainer.train(dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default3]:[rank59]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank59]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank59]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank59]: output = model(**micro_batch) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank59]: sharded_logits = self.model( -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank59]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default3]:[rank59]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank59]: output = self.pp_block(**new_kwargs) -[default3]:[rank59]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank59]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default3]:[rank59]: output = self.o_proj(attention_output) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank59]: return self._call_impl(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank59]: return forward_call(*args, **kwargs) -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank59]: return row_linear( -[default3]:[rank59]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default3]:[rank59]: out = F.linear(input, weight, bias) -[default3]:[rank59]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 563.94 MiB is free. Including non-PyTorch memory, this process has 78.77 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default0]:[rank56]: Traceback (most recent call last): -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank56]: trainer.train(dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank56]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank56]: outputs = self.pipeline_engine.train_batch_iter( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank56]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank56]: output = model(**micro_batch) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default0]:[rank56]: sharded_logits = self.model( -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank56]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default0]:[rank56]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default0]:[rank56]: output = self.pp_block(**new_kwargs) -[default0]:[rank56]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default0]:[rank56]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default0]:[rank56]: output = self.o_proj(attention_output) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank56]: return self._call_impl(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default0]:[rank56]: return forward_call(*args, **kwargs) -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default0]:[rank56]: return row_linear( -[default0]:[rank56]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default0]:[rank56]: out = F.linear(input, weight, bias) -[default0]:[rank56]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. 
GPU -[default7]:[rank55]: Traceback (most recent call last): -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank55]: trainer.train(dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank55]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank55]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank55]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank55]: output = model(**micro_batch) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank55]: sharded_logits = self.model( -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank55]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank55]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank55]: output = self.pp_block(**new_kwargs) -[d[default6]:[rank62]: Traceback (most recent call last): -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in 
-[default6]:[rank62]: trainer.train(dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank62]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank62]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank62]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotefault7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank55]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank55]: return self._call_impl(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank55]: return forward_call(*args, **kwargs) -[default7]:[rank55]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default7]:[rank55]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default7]:[rank55]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 69.94 MiB is free. Including non-PyTorch memory, this process has 79.25 GiB meron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank62]: output = model(**micro_batch) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -mory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank62]: sharded_logits = self.model( -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank62]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank62]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank62]: output = self.pp_block(**new_kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank62]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank62]: output = self.o_proj(attention_output) -[default6]:[rank62]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank62]: return self._call_impl(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank62]: return forward_call(*args, **kwargs) -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default6]:[rank62]: return row_linear( -[default6]:[rank62]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank62]: out = F.linear(input, weight, bias) -[default6]:[rank62]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 475.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank52]: Traceback (most recent call last): -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank52]: trainer.train(dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank52]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank52]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank52]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank52]: output = model(**micro_batch) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank52]: sharded_logits = self.model( -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default4]:[rank52]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default4]:[rank52]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default4]:[rank52]: output = self.pp_block(**new_kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default4]:[rank52]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default4]:[rank52]: output = self.o_proj(attention_output) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank52]: return self._call_impl(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank52]: return forward_call(*args, **kwargs) -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default4]:[rank52]: return row_linear( -[default4]:[rank52]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default4]:[rank52]: out = F.linear(input, weight, bias) 
-[default4]:[rank52]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 27.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 63.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank50]: Traceback (most recent call last): -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank50]: trainer.train(dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank50]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank50]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank50]: output = model(**micro_batch) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank50]: return forward_call(*args, **kwargs) -[default2]:[rank50]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default1]:[rank49]: Traceback (most recent call last): -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank48]: Traceback (most recent call last): -[default1]:[rank49]: trainer.train(dataloader) -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank50]: sharded_logits = self.model( -[default1]:[rank49]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank50]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default0]:[rank48]: trainer.train(dataloader) -[default2]:[rank50]: return self._call_impl(*args, **kwargs) -[default0]:[rank48]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default1]:[rank49]: outputs = 
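Note on the allocator hint printed in each error above: every failing rank reports roughly 69.6-69.7 GiB allocated by PyTorch on a 79.33 GiB device, with under ~128 MiB reserved but unallocated, so this looks like genuine over-allocation rather than fragmentation. If one still wanted to try the suggestion from the message, a minimal sketch (assuming the variable is exported in the launch environment before the training processes start; this is not part of the original script) would be:

  # Hypothetical tweak taken from the OOM message itself: allow the CUDA caching
  # allocator to use expandable segments to reduce fragmentation.
  export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

For an overflow of this size, lowering per-GPU activation memory (for example a smaller micro-batch size or sequence length) would be the more direct lever.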
-[… ranks 48, 49, 50, 58, and 63 hit the same torch.cuda.OutOfMemoryError; their tracebacks were interleaved with those above, so only the distinguishing final lines are kept here …]
-[default2]:[rank58]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 475.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated.
-[default7]:[rank63]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 563.94 MiB is free. Including non-PyTorch memory, this process has 78.77 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated.
-[default1]:[rank49]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default1]:[rank49]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default1]:[rank49]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 69.94 MiB is free. Including non-PyTorch memory, this process has 79.25 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated.
-[default2]:[rank50]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 27.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 63.53 MiB is reserved by PyTorch but unallocated.
-[default0]:[rank48]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU
GPU -[default3]:[rank51]: Traceback (most recent call last): -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: Traceback (most recent call last): -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank53]: trainer.train(dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: trainer.train(dataloader) -[default5]:[rank53]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank53]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: output = model(**micro_batch) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank53]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank51]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank51]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank53]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] 
-[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default3]:[rank51]: output = model(**micro_batch) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default3]:[rank51]: sharded_logits = self.model( -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default3]:[rank51]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank53]: output = self.pp_block(**new_kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File 
"/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank51]: output = self.pp_block(**new_kwargs) -[default5]:[rank53]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank53]: return self._call_impl(*args, **kwargs) -[default5]:[rank53]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank51]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank51]: return self._call_impl(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank53]: return forward_call(*args, **kwargs) -[default3]:[rank51]: return forward_call(*args, **kwargs) -[default3]:[rank51]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default5]:[rank53]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default3]:[rank51]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default3]:[rank51]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 69.94 MiB is free. Including non-PyTorch memory, this process has 79.25 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank53]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default5]:[rank53]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 69.94 MiB is free. Including non-PyTorch memory, this process has 79.25 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank54]: Traceback (most recent call last): -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank54]: trainer.train(dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank54]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank54]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank54]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank54]: output = model(**micro_batch) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank54]: sharded_logits = self.model( -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank54]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank54]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default6]:[rank54]: output = self.pp_block(**new_kwargs) -[default6]:[rank54]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank54]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank54]: return self._call_impl(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank54]: return forward_call(*args, **kwargs) -[default6]:[rank54]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default6]:[rank54]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default6]:[rank54]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 93.94 MiB is free. Including non-PyTorch memory, this process has 79.23 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 128.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default3]:[rank3]: Traceback (most recent call last): -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank6]: Traceback (most recent call last): -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank7]: Traceback (most recent call last): -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default3]:[rank3]: trainer.train(dataloader) -[default7]:[rank7]: trainer.train(dataloader) -[default6]:[rank6]: trainer.train(dataloader) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter( -[default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: output = model(**micro_batch) -[default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank6]: output = model(**micro_batch) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: sharded_logits = self.model( -[default3]:[rank3]: sharded_logits = self.model( -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: output = model(**micro_batch) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank7]: sharded_logits = self.model( -[default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank7]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: output = self.pp_block(**new_kwargs) -[default6]:[rank6]: output = self.pp_block(**new_kwargs) -[default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: return forward_call(*args, **kwargs) 
-[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank6]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank7]: output = self.pp_block(**new_kwargs) -[default3]:[rank3]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default3]:[rank3]: output = self.o_proj(attention_output) -[default7]:[rank7]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default3]:[rank3]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: return self._call_impl(*args, **kwargs) 
-[default6]:[rank6]: return self._call_impl(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: return forward_call(*args, **kwargs) -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default6]:[rank6]: return forward_call(*args, **kwargs) -[default3]:[rank3]: return row_linear( -[default7]:[rank7]: output = self.o_proj(attention_output) -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: return row_linear( -[default3]:[rank3]: out = F.linear(input, weight, bias) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default6]:[rank6]: out = F.linear(input, weight, bias) -[default7]:[rank7]: return self._call_impl(*args, **kwargs) -[default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 475.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 563.94 MiB is free. Including non-PyTorch memory, this process has 78.77 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank7]: return forward_call(*args, **kwargs) -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank7]: return row_linear( -[default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank7]: out = F.linear(input, weight, bias) -[default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 475.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default5]:[rank5]: Traceback (most recent call last): -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default5]:[rank5]: trainer.train(dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default5]:[rank5]: output = model(**micro_batch) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default5]:[rank5]: sharded_logits = self.model( -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default5]:[rank5]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default5]:[rank5]: output = self.pp_block(**new_kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default5]:[rank5]: output = self.o_proj(attention_output) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default5]:[rank5]: return self._call_impl(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default5]:[rank5]: return forward_call(*args, **kwargs) -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default5]:[rank5]: return row_linear( -[default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default5]:[rank5]: out = F.linear(input, weight, bias) -[default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 475.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default6]:[rank14]: Traceback (most recent call last): -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default6]:[rank14]: trainer.train(dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter( -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default6]:[rank14]: output = model(**micro_batch) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default6]:[rank14]: sharded_logits = self.model( -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward 
-[default6]:[rank14]: output = self.pp_block(**new_kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default6]:[rank14]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default6]:[rank14]: return self._call_impl(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default6]:[rank14]: return forward_call(*args, **kwargs) -[default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default6]:[rank14]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 5.94 MiB is free. Including non-PyTorch memory, this process has 79.31 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 128.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default4]:[rank4]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default4]:[rank4]: trainer.train(dataloader) -[default0]:[rank0]: Traceback (most recent call last): -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default0]:[rank0]: trainer.train(dataloader) -[default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default4]:[rank4]: output = model(**micro_batch) -[default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default4]:[rank4]: sharded_logits = self.model( -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default4]:[rank4]: return self._call_impl(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default4]:[rank4]: return forward_call(*args, **kwargs) -[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default4]:[rank4]: return 
self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
-[default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
-[default4]:[rank4]: return self._call_impl(*args, **kwargs)
-[default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
-[default4]:[rank4]: return forward_call(*args, **kwargs)
-[default4]:[rank4]: [identical torch/nn/modules/module.py _wrapped_call_impl / _call_impl frame pairs between the remaining module calls elided]
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
-[default4]:[rank4]: output = self.pp_block(**new_kwargs)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
-[default4]:[rank4]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward
-[default4]:[rank4]: output = self.o_proj(attention_output)
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
-[default4]:[rank4]: return row_linear(
-[default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
-[default4]:[rank4]: out = F.linear(input, weight, bias)
-[default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 563.94 MiB is free. Including non-PyTorch memory, this process has 78.77 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default0]:[rank0]: [traceback frames identical to the other ranks: run_train.py:237 -> trainer.py:429/462 -> pipeline_parallel/engine.py:278/44 -> llama.py:891/764/780 -> block.py:151 -> llama.py:631 -> llama.py:598 -> tensor_parallel/nn.py:159 -> tensor_parallel/functional.py:474, elided]
-[default0]:[rank0]: out = F.linear(input, weight, bias)
-[default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU
-[default2]:[rank2]: Traceback (most recent call last):
-[default2]:[rank2]: [same traceback frames as rank 4 above, elided]
-[default2]:[rank2]: out = F.linear(input, weight, bias)
-[default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 563.94 MiB is free. Including non-PyTorch memory, this process has 78.77 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default1]:[rank1]: Traceback (most recent call last):
-[default1]:[rank1]: [same traceback frames as rank 4 above, elided]
-[default1]:[rank1]: out = F.linear(input, weight, bias)
-[default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 475.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 127.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
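The failing F.linear call above sits inside nanotron's row_linear, reached from o_proj. As a rough illustration of why this call allocates a full-width output buffer on every tensor-parallel rank before the partial results are reduced, here is a generic Megatron-style row-parallel forward; it is a minimal sketch with assumed shapes and an assumed, already-initialised process group, not the nanotron implementation:

import torch
import torch.distributed as dist
import torch.nn.functional as F

def row_parallel_linear(x_shard: torch.Tensor,
                        w_shard: torch.Tensor,
                        bias: torch.Tensor | None,
                        tp_group) -> torch.Tensor:
    # Each TP rank holds a slice of the weight along the *input* dimension,
    # so F.linear already produces a full-size [*, out_features] buffer here.
    # This output allocation is the kind of request that fails in the log above.
    out = F.linear(x_shard, w_shard)
    # Partial results are summed across the tensor-parallel group.
    dist.all_reduce(out, op=dist.ReduceOp.SUM, group=tp_group)
    if bias is not None:
        out = out + bias
    return out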
-[default0]:[rank8]: Traceback (most recent call last):
-[default0]:[rank8]: [frames identical to rank 4 above down to llama.py:631 (self.attn), elided]
-[default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
-[default0]:[rank8]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU
-[default4]:[rank12]: Traceback (most recent call last):
-[default4]:[rank12]: [frames identical to rank 4 above down to llama.py:631, then llama.py:565, elided]
-[default4]:[rank12]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 5.94 MiB is free. Including non-PyTorch memory, this process has 79.31 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 128.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default3]:[rank11]: Traceback (most recent call last):
-[default3]:[rank11]: [frames identical to rank 4 above down to llama.py:631, then llama.py:565, elided]
-[default3]:[rank11]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
-[default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU has a total capacity of 79.33 GiB of which 93.94 MiB is free. Including non-PyTorch memory, this process has 79.23 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 128.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default7]:[rank15]: Traceback (most recent call last): -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default7]:[rank15]: trainer.train(dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default7]:[rank15]: output = model(**micro_batch) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default7]:[rank15]: sharded_logits = self.model( -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default7]:[rank15]: output = self.pp_block(**new_kwargs) -[default7]:[rank15]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 598, in forward -[default7]:[rank15]: output = self.o_proj(attention_output) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default7]:[rank15]: return self._call_impl(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default7]:[rank15]: return forward_call(*args, **kwargs) -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward -[default7]:[rank15]: return row_linear( -[default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear -[default7]:[rank15]: out = F.linear(input, weight, bias) -[default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 27.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 63.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[default2]:[rank10]: Traceback (most recent call last): -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in -[default2]:[rank10]: trainer.train(dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train -[default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step -[default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter( -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter -[default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward -[default2]:[rank10]: output = model(**micro_batch) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward -[default2]:[rank10]: sharded_logits = self.model( -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward -[default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0] -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states -[default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward -[default2]:[rank10]: output = self.pp_block(**new_kwargs) -[default2]:[rank10]: File 
"/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward -[default2]:[rank10]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl -[default2]:[rank10]: return self._call_impl(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl -[default2]:[rank10]: return forward_call(*args, **kwargs) -[default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward -[default2]:[rank10]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() -[default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 69.94 MiB is free. Including non-PyTorch memory, this process has 79.25 GiB memory in use. Of the allocated memory 69.57 GiB is allocated by PyTorch, and 64.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
-[default1]:[rank9]: Traceback (most recent call last):
-[default1]:[rank9]: [same traceback frames as rank 4 above, elided]
-[default1]:[rank9]: out = F.linear(input, weight, bias)
-[default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 27.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 63.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[default5]:[rank13]: Traceback (most recent call last):
-[default5]:[rank13]: [same traceback frames as rank 4 above, elided]
-[default5]:[rank13]: out = F.linear(input, weight, bias)
-[default5]:[rank13]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU has a total capacity of 79.33 GiB of which 27.94 MiB is free. Including non-PyTorch memory, this process has 79.29 GiB memory in use. Of the allocated memory 69.70 GiB is allocated by PyTorch, and 63.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-E0703 01:01:41.603000 140709557507904 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1398940) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-Traceback (most recent call last):
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
- sys.exit(main())
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
- return f(*args, **kwargs)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
- run(args)
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
- elastic_launch(
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
- return launch_agent(self._config, self._entrypoint, list(args))
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
- raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures (all at 2024-07-03_01:01:41 on ip-26-0-162-233.ec2.internal, exitcode 1, no error_file recorded):
-[1]: rank 33 (local_rank: 1, pid 1398941)
-[2]: rank 34 (local_rank: 2, pid 1398942)
-[3]: rank 35 (local_rank: 3, pid 1398943)
-[4]: rank 36 (local_rank: 4, pid 1398944)
-[5]: rank 37 (local_rank: 5, pid 1398945)
-[6]: rank 38 (local_rank: 6, pid 1398946)
-[7]: rank 39 (local_rank: 7, pid 1398947)
-Root Cause (first observed failure):
-[0]: rank 32 (local_rank: 0, pid 1398940)
-traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
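The failure table above has an empty error_file for every rank, which is why torchrun can only point at https://pytorch.org/docs/stable/elastic/errors.html instead of showing the worker traceback. Decorating the training entry point with torch.distributed.elastic's record helper is the documented way to get the worker exception written into that error file; a minimal sketch, assuming run_train.py exposes a main()-style entry point (its actual structure is not shown in this log):

from torch.distributed.elastic.multiprocessing.errors import record

@record  # on failure, writes the exception and traceback to the file torchrun reports as error_file
def main() -> None:
    ...  # build the trainer and call trainer.train(dataloader)

if __name__ == "__main__":
    main()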
-E0703 01:01:41.701000 139854410950464 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1139831) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:01:41.701000 140540211869504 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1418502) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:01:41.703000 140092964255552 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3762966) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:01:41.703000 140324900312896 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 880579) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:01:41.703000 140432057993024 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 867197) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:01:41.704000 140015942473536 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1776595) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-E0703 01:01:41.706000 140574819604288 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3891512) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
-[each failed torchrun agent then prints the same Traceback as above (torchrun -> run.py main -> run -> elastic_launch -> launch_agent -> raise ChildFailedError), interleaved across nodes]
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures (all at 2024-07-03_01:01:41 on ip-26-0-171-88.ec2.internal, exitcode 1, no error_file recorded):
-[1]: rank 57 (local_rank: 1, pid 880580)
-[2]: rank 58 (local_rank: 2, pid 880581)
-[3]: rank 59 (local_rank: 3, pid 880582)
-[4]: rank 60 (local_rank: 4, pid 880583)
-[5]: rank 61 (local_rank: 5, pid 880584)
-[6]: rank 62 (local_rank: 6, pid 880585)
-[7]: rank 63 (local_rank: 7, pid 880586)
-Root Cause (first observed failure):
-[0]: rank 56 (local_rank: 0, pid 880579)
-traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
-------------------------------------------------------------
-Failures (all at 2024-07-03_01:01:41 on ip-26-0-171-102.ec2.internal, exitcode 1, no error_file recorded):
-[1]: rank 41 (local_rank: 1, pid 3762967)
-[2]: rank 42 (local_rank: 2, pid 3762968)
-[3]: rank 43 (local_rank: 3, pid 3762969)
-[4]: rank 44 (local_rank: 4, pid 3762970)
-[5]: rank 45 (local_rank: 5, pid 3762971)
-[6]: rank 46 (local_rank: 6, pid 3762972)
-[7]: rank 47 (local_rank: 7, pid 3762973)
-traceback : To enable traceback see: https://pytorch.org/docs/st
- run(args)
-able/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-102.ec2.internal - rank : 40 (local_rank: 0) - exitcode : 1 (pid: 3762966) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-78.ec2.internal - rank : 25 (local_rank: 1) - exitcode : 1 (pid: 1139832) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-78.ec2.internal - rank : 26 (local_rank: 2) - exitcode : 1 (pid: 1139833) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-78.ec2.internal - rank : 27 (local_rank: 3) - exitcode : 1 (pid: 1139834) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-78.ec2.internal - rank : 28 (local_rank: 4) - exitcode : 1 (pid: 1139835) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-78.ec2.internal - rank : 29 (local_rank: 5) - exitcode : 1 (pid: 1139836) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-78.ec2.internal - rank : 30 (local_rank: 6) - exitcode : 1 (pid: 1139837) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_01:01:41 - host : 
ip-26-0-161-78.ec2.internal - rank : 31 (local_rank: 7) - exitcode : 1 (pid: 1139838) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-78.ec2.internal - rank : 24 (local_rank: 0) - exitcode : 1 (pid: 1139831) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1776596) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 2 (local_rank: 2) - exitcode : 1 (pid: 1776597) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 3 (local_rank: 3) - exitcode : 1 (pid: 1776598) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 4 (local_rank: 4) - exitcode : 1 (pid: 1776599) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 5 (local_rank: 5) - exitcode : 1 (pid: 1776600) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 6 (local_rank: 6) - exitcode : 1 (pid: 1776601) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 7 (local_rank: 7) - exitcode : 1 (pid: 1776602) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:01:41 - host : ip-26-0-160-225.ec2.internal - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 1776595) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper 
- elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - raise ChildFailedError( -Traceback (most recent call last): - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 49 (local_rank: 1) - exitcode : 1 (pid: 3891513) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 50 (local_rank: 2) - exitcode : 1 (pid: 3891514) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 51 (local_rank: 3) - exitcode : 1 (pid: 3891515) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 52 (local_rank: 4) - exitcode : 1 (pid: 3891516) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 53 (local_rank: 5) - exitcode : 1 (pid: 3891517) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 54 (local_rank: 6) - exitcode : 1 (pid: 3891518) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 55 (local_rank: 7) - exitcode : 1 (pid: 3891519) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:01:41 - host : ip-26-0-171-62.ec2.internal - rank : 48 (local_rank: 0) - exitcode : 1 (pid: 3891512) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - sys.exit(main()) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return launch_agent(self._config, 
self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - return f(*args, **kwargs) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 9 (local_rank: 1) - exitcode : 1 (pid: 867198) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 10 (local_rank: 2) - exitcode : 1 (pid: 867199) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 11 (local_rank: 3) - exitcode : 1 (pid: 867200) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 12 (local_rank: 4) - exitcode : 1 (pid: 867201) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 13 (local_rank: 5) - exitcode : 1 (pid: 867202) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 14 (local_rank: 6) - exitcode : 1 (pid: 867203) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 15 (local_rank: 7) - exitcode : 1 (pid: 867204) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-103.ec2.internal - rank : 8 (local_rank: 0) - exitcode : 1 (pid: 867197) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ - run(args) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run - elastic_launch( - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-07-03_01:01:41 - 
host : ip-26-0-161-153.ec2.internal - rank : 17 (local_rank: 1) - exitcode : 1 (pid: 1418503) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[2]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-153.ec2.internal - rank : 18 (local_rank: 2) - exitcode : 1 (pid: 1418504) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[3]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-153.ec2.internal - rank : 19 (local_rank: 3) - exitcode : 1 (pid: 1418505) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[4]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-153.ec2.internal - rank : 20 (local_rank: 4) - exitcode : 1 (pid: 1418506) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[5]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-153.ec2.internal - rank : 21 (local_rank: 5) - exitcode : 1 (pid: 1418507) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[6]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-153.ec2.internal - rank : 22 (local_rank: 6) - exitcode : 1 (pid: 1418508) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -[7]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-153.ec2.internal - rank : 23 (local_rank: 7) - exitcode : 1 (pid: 1418509) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-07-03_01:01:41 - host : ip-26-0-161-153.ec2.internal - rank : 16 (local_rank: 0) - exitcode : 1 (pid: 1418502) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: ip-26-0-162-233: task 4: Exited with exit code 1 -srun: error: ip-26-0-160-225: task 0: Exited with exit code 1 -srun: error: ip-26-0-161-153: task 3: Exited with exit code 1 -srun: error: ip-26-0-161-78: task 1: Exited with exit code 1 -srun: error: ip-26-0-171-102: task 7: Exited with exit code 1 -srun: error: ip-26-0-171-88: task 6: Exited with exit code 1 -srun: error: ip-26-0-171-62: task 5: Exited with exit code 1 -srun: error: ip-26-0-161-103: task 2: Exited with exit code 1 -Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details. 
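The run logged above dies with a CUDA OOM inside the tensor-parallel row_linear call for o_proj: the allocator cannot find 1024.00 MiB on a GPU that already has 79.29 GiB of its 79.33 GiB in use, every torchrun agent then reports ChildFailedError, and the grep-based status logic in the accompanying bench.slurm records the job as "oom". The error text itself suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True as a fragmentation mitigation. A minimal sketch of how that could be wired into the launcher environment follows; this export is a hypothetical addition, not part of the original bench.slurm, and since only about 64 MiB is reserved-but-unallocated here, fragmentation is unlikely to be the root cause, so it is a mitigation to try rather than a guaranteed fix (reducing the micro batch size is the more direct lever).

# Hypothetical addition: export alongside the other CUDA-related variables
# in bench.slurm, before `srun -u $LAUNCHER $CMD`.
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"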
diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt deleted file mode 100644 index d9a90241ad66cdfd6a6396003e9a377075dda105..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-64/status.txt +++ /dev/null @@ -1 +0,0 @@ -oom \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/bench.slurm b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/bench.slurm deleted file mode 100644 index 6418aa4cf177a7323bea624d4414ccdcb355301c..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/bench.slurm +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=bench_cluster -#SBATCH --time=01:30:00 -#SBATCH --partition=hopper-prod -#SBATCH --nodes=8 -#SBATCH --gres=gpu:8 -#SBATCH --qos=high -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=96 -#SBATCH --exclusive -#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out -#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Misc initializations. -echo "========================" -echo "START TIME: $(date)" -source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh -conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster -echo python3 version = $(python3 --version) -echo "========================" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" - -huggingface-cli login --token $HUGGINGFACE_TOKEN - - -NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron" -CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/config.yaml" - -LAUNCHER="torchrun \ - --nproc_per_node 8 \ - --nnodes 8 \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - --node_rank ${SLURM_PROCID}" - -# Checkout the bench_cluster branch -cd $NANOTRON_REPO -git checkout bench_cluster -cd .. -# Get the current job ID -job_id=${SLURM_JOB_ID} - -# Update status to "pending" or "running" in the background -update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/status.txt & - -# Run the main command -srun -u $LAUNCHER $CMD -exit_status=$? 
- -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/status.txt -else - if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/status.txt - elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out; then - printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/status.txt - elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out; then - printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/status.txt - else - printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/status.txt - fi -fi - -# Run the report script if the job completed successfully -if [ $exit_status -eq 0 ]; then - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8 --is_logs - python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8 --is_profiler -fi - - -# Push to hub the folder using huggingface_cli -huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8 llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8 --commit-message "Upload llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8" - -# Verify the upload -if [ $? 
-eq 0 ]; then - echo "Uploading to Huggingface Hub successful" -else - echo "Failed to upload to Huggingface Hub" -fi \ No newline at end of file diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/config.yaml b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/config.yaml deleted file mode 100644 index b87255033bfe3c81829dee0d469dbf38b8d62a89..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/config.yaml +++ /dev/null @@ -1,90 +0,0 @@ -general: - project: bench_cluster - seed: 42 -model: - ddp_bucket_cap_mb: 25 - dtype: bfloat16 - init_method: - std: 0.025 - make_vocab_size_divisible_by: 1 - model_config: - bos_token_id: 1 - eos_token_id: 2 - hidden_act: silu - hidden_size: 2048 - initializer_range: 0.02 - intermediate_size: 4096 - is_llama_config: true - max_position_embeddings: 4096 - num_attention_heads: 32 - num_hidden_layers: 24 - num_key_value_heads: 32 - pad_token_id: null - pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null - rope_theta: 10000.0 - tie_word_embeddings: true - use_cache: true - vocab_size: 50257 -optimizer: - accumulate_grad_in_fp32: true - clip_grad: 1.0 - learning_rate_scheduler: - learning_rate: 0.0001 - lr_decay_style: linear - lr_warmup_style: linear - lr_warmup_steps: 1 - min_decay_lr: 1.0e-05 - optimizer_factory: - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 - name: adamW - torch_adam_is_fused: true - weight_decay: 0.01 - zero_stage: 1 -parallelism: - dp: 4 - expert_parallel_size: 1 - pp: 1 - pp_engine: 1f1b - tp: 16 - tp_linear_async_communication: false - tp_mode: REDUCE_SCATTER -profiler: - profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8 -tokenizer: - tokenizer_max_length: null - tokenizer_name_or_path: openai-community/gpt2 - tokenizer_revision: null -data_stages: -- name: Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 64 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - text_column_name: text - num_loading_workers: 0 - seed: 42 -lighteval: null -tokens: - train_steps: 20 - val_check_interval: -1 - batch_accumulation_per_replica: 32 - limit_test_batches: 0 - limit_val_batches: 0 - micro_batch_size: 8 - sequence_length: 4096 -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info -checkpoints: - checkpoint_interval: 100000 - checkpoints_path: /dev/null - resume_checkpoint_path: null diff --git a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out b/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out deleted file mode 100644 index 24e886144df10a1fcf467d08724303b52aa1d561..0000000000000000000000000000000000000000 --- a/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out +++ /dev/null @@ -1,721 +0,0 @@ -======================== -START TIME: Wed Jul 3 03:58:08 UTC 2024 -python3 version = Python 3.10.14 -======================== -The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well. -Token is valid (permission: write). 
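For the dp-4_tp-16_pp-1_mbz-8 configuration just shown, the parallel topology multiplies out to exactly the 64 GPUs requested by the SBATCH header (8 nodes with 8 GPUs each). A quick shell check of that arithmetic, under the usual assumption that world size = dp * tp * pp:

# Sanity-check the parallel layout against the Slurm allocation.
dp=4; tp=16; pp=1
nodes=8; gpus_per_node=8
echo "world size: $((dp * tp * pp))"            # 64
echo "allocation: $((nodes * gpus_per_node))"   # 64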
-Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token -Login successful -Already on 'bench_cluster' -M examples/config_tiny_llama.py -M examples/config_tiny_llama.yaml -M examples/train_tiny_llama.sh -M src/nanotron/models/llama.py -M src/nanotron/trainer.py -Your branch is up to date with 'origin/bench_cluster'. -Job status: RUNNING -W0703 03:58:14.029000 140579128530752 torch/distributed/run.py:757] -W0703 03:58:14.029000 140579128530752 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.029000 140579128530752 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:58:14.029000 140579128530752 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.030000 140655886456640 torch/distributed/run.py:757] -W0703 03:58:14.030000 140655886456640 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.030000 140655886456640 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:58:14.030000 140655886456640 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.070000 140096564365120 torch/distributed/run.py:757] -W0703 03:58:14.070000 140096564365120 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.070000 140096564365120 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:58:14.070000 140096564365120 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.072000 140280966899520 torch/distributed/run.py:757] -W0703 03:58:14.072000 140280966899520 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.072000 140280966899520 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:58:14.072000 140280966899520 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.105000 139900357510976 torch/distributed/run.py:757] -W0703 03:58:14.105000 139900357510976 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.105000 139900357510976 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
-W0703 03:58:14.105000 139900357510976 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.197000 140708830381888 torch/distributed/run.py:757] -W0703 03:58:14.197000 140708830381888 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.197000 140708830381888 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:58:14.197000 140708830381888 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.675000 140400142636864 torch/distributed/run.py:757] -W0703 03:58:14.675000 140400142636864 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.675000 140400142636864 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:58:14.675000 140400142636864 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.774000 140463784273728 torch/distributed/run.py:757] -W0703 03:58:14.774000 140463784273728 torch/distributed/run.py:757] ***************************************** -W0703 03:58:14.774000 140463784273728 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0703 03:58:14.774000 140463784273728 torch/distributed/run.py:757] ***************************************** -[default0]:07/03/2024 03:58:39 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Vocab Size Padding] Padded vocab (size: 50257) with 15 dummy tokens (new size: 50272) -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config: -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Config(general=GeneralArgs(project='bench_cluster', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: run='%date_%jobid', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: step=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: consumed_train_samples=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: benchmark_csv_path=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ignore_sanity_checks=True), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: parallelism=ParallelismArgs(dp=4, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp=16, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pp_engine=, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_mode=, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tp_linear_async_communication=False, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: expert_parallel_size=1), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:58:39 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: eos_token_id=2, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: init_method=RandomInit(std=0.025), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dtype=torch.bfloat16, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: make_vocab_size_divisible_by=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: ddp_bucket_cap_mb=25), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_revision=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokenizer_max_length=None), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoint_interval=100000, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: save_initial_state=False, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: resume_checkpoint_path=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: checkpoints_path_is_shared_file_system=False), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: logging=LoggingArgs(log_level='info', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: log_level_replica='info', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration_step_info_interval=1), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tokens=TokensArgs(sequence_length=4096, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: train_steps=20, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: micro_batch_size=8, -[default0]:07/03/2024 03:58:39 
[INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: batch_accumulation_per_replica=32, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: val_check_interval=-1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_val_batches=0, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: limit_test_batches=0), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta1=0.9, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: adam_beta2=0.95, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: torch_adam_is_fused=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: name='adamW'), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: zero_stage=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: weight_decay=0.01, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: clip_grad=1.0, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: accumulate_grad_in_fp32=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_steps=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_warmup_style='linear', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_style='linear', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_steps=19, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lr_decay_starting_step=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: min_decay_lr=1e-05)), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data_stages=[DatasetStageArgs(name='Training Stage', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: start_training_step=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_splits='train', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hf_dataset_config_name=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_processing_num_proc_per_process=64, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: dataset_overwrite_cache=False, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: text_column_name='text'), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: seed=42, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_loading_workers=0))], -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8')), -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: lighteval=None) -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Model Config: -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: LlamaConfig(bos_token_id=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: 
eos_token_id=2, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_act='silu', -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: hidden_size=2048, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: initializer_range=0.02, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: intermediate_size=4096, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: is_llama_config=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: max_position_embeddings=4096, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_attention_heads=32, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_hidden_layers=24, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: num_key_value_heads=32, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pad_token_id=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: pretraining_tp=1, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rms_norm_eps=1e-05, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_scaling=None, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: rope_theta=10000.0, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: tie_word_embeddings=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: use_cache=True, -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: vocab_size=50272) -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Building model.. -[default0]:07/03/2024 03:58:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Setting PP block ranks... -[default4]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=12|ip-26-0-173-202]: No checkpoint path provided. -[default6]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=14|ip-26-0-173-202]: No checkpoint path provided. -[default7]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=15|ip-26-0-173-202]: No checkpoint path provided. -[default5]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=13|ip-26-0-173-202]: No checkpoint path provided. -[default3]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=11|ip-26-0-173-202]: No checkpoint path provided. -[default0]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=8|ip-26-0-173-202]: No checkpoint path provided. -[default1]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=9|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=10|ip-26-0-173-202]: No checkpoint path provided. -[default2]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=2|ip-26-0-166-125]: No checkpoint path provided. -[default0]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=0|ip-26-0-166-125]: No checkpoint path provided. -[default1]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=1|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=3|ip-26-0-166-125]: No checkpoint path provided. -[default4]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=4|ip-26-0-166-125]: No checkpoint path provided. -[default6]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=6|ip-26-0-166-125]: No checkpoint path provided. -[default5]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=5|ip-26-0-166-125]: No checkpoint path provided. -[default7]:07/03/2024 03:58:56 [INFO|DP=2|PP=0|TP=7|ip-26-0-166-125]: No checkpoint path provided. -[default3]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=11|ip-26-0-174-36]: No checkpoint path provided. 
-[default0]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=8|ip-26-0-174-36]: No checkpoint path provided. -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Total number of parameters: 1.11G (2119.44MiB) -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Parametrizing model parameters using StandardParametrizator -[default4]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=4|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=7|ip-26-0-173-246]: No checkpoint path provided. -[default3]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=3|ip-26-0-173-246]: No checkpoint path provided. -[default7]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=7|ip-26-0-162-233]: No checkpoint path provided. -[default7]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default7]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default7]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=15|ip-26-0-163-147]: No checkpoint path provided. -[default2]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=10|ip-26-0-163-147]: No checkpoint path provided. -[default6]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=14|ip-26-0-163-147]: No checkpoint path provided. -[default4]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=12|ip-26-0-163-147]: No checkpoint path provided. -[default3]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=11|ip-26-0-163-147]: No checkpoint path provided. 
-[default1]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=9|ip-26-0-163-147]: No checkpoint path provided. -[default0]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=8|ip-26-0-163-147]: No checkpoint path provided. -[default5]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=13|ip-26-0-163-147]: No checkpoint path provided. -[default1]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=9|ip-26-0-174-36]: No checkpoint path provided. -[default4]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=12|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default2]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default2]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=2|ip-26-0-162-233]: No checkpoint path provided. -[default4]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default4]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default4]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=4|ip-26-0-162-233]: No checkpoint path provided. -[default3]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default3]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default3]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=3|ip-26-0-162-233]: No checkpoint path provided. -[default1]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default1]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default1]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=1|ip-26-0-162-233]: No checkpoint path provided. -[default6]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=6|ip-26-0-173-246]: No checkpoint path provided. -[default0]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=0|ip-26-0-173-246]: No checkpoint path provided. -[default6]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=14|ip-26-0-174-36]: No checkpoint path provided. -[default2]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=10|ip-26-0-174-36]: No checkpoint path provided. -[default7]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=15|ip-26-0-174-36]: No checkpoint path provided. 
-[default5]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default5]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default5]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=5|ip-26-0-162-233]: No checkpoint path provided. -[default5]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=5|ip-26-0-173-246]: No checkpoint path provided. -[default2]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=2|ip-26-0-173-246]: No checkpoint path provided. -[default5]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=13|ip-26-0-174-36]: No checkpoint path provided. -[default1]:07/03/2024 03:58:56 [INFO|DP=3|PP=0|TP=1|ip-26-0-173-246]: No checkpoint path provided. -[default6]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: Local number of parameters: 69.4M (132.46MiB) -[default6]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: [After model building] Memory usage: 159.71MiB. Peak allocated: 174.02MiB Peak reserved: 178.00MiB -[default6]:07/03/2024 03:58:56 [INFO|DP=0|PP=0|TP=6|ip-26-0-162-233]: No checkpoint path provided. -[default0]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=8|ip-26-0-165-24]: No checkpoint path provided. -[default6]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=14|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=9|ip-26-0-165-24]: No checkpoint path provided. -[default2]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=10|ip-26-0-165-24]: No checkpoint path provided. -[default3]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=11|ip-26-0-165-24]: No checkpoint path provided. -[default7]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=15|ip-26-0-165-24]: No checkpoint path provided. -[default5]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=13|ip-26-0-165-24]: No checkpoint path provided. -[default4]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=12|ip-26-0-165-24]: No checkpoint path provided. -[default1]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=1|ip-26-0-164-207]: No checkpoint path provided. -[default0]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=0|ip-26-0-164-207]: No checkpoint path provided. -[default2]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=2|ip-26-0-164-207]: No checkpoint path provided. -[default3]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=3|ip-26-0-164-207]: No checkpoint path provided. -[default4]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=4|ip-26-0-164-207]: No checkpoint path provided. -[default7]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=7|ip-26-0-164-207]: No checkpoint path provided. -[default6]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=6|ip-26-0-164-207]: No checkpoint path provided. -[default5]:07/03/2024 03:58:57 [INFO|DP=1|PP=0|TP=5|ip-26-0-164-207]: No checkpoint path provided. 
-[default0]:07/03/2024 03:58:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Optimizer Building] Using LearningRateForSP as learning rate
-[default0]:07/03/2024 03:58:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] Size of optimizer params per rank:
-[default0]:07/03/2024 03:58:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 0 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:58:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 1 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:58:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 2 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:58:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [ZeRO sharding] DP Rank 3 has 17.4M out of 69.4M (25.00%) params' optimizer states
-[default0]:07/03/2024 03:59:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
-[default0]:07/03/2024 03:59:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Using `datasets` library
-[default0]:07/03/2024 03:59:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
-[default0]:Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 03:59:00 [WARNING|DP=0|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 03:59:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Training Plan] There are 1 training stages
-[default0]:07/03/2024 03:59:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Stage Training Stage] start from step 1
-[default0]:07/03/2024 03:59:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]:
-[default0]:07/03/2024 03:59:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: [Start training] datetime: 2024-07-03 03:59:02.663478 | mbs: 8 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
-[default0]:07/03/2024 03:59:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
-[default0]:07/03/2024 03:59:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 490.87MiB. Peak allocated 490.87MiB. Peak reserved: 512.00MiB
-[default5]:07/03/2024 03:59:02 [WARNING|DP=2|PP=0|TP=13|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty.
-[default2]:07/03/2024 03:59:02 [WARNING|DP=2|PP=0|TP=10|ip-26-0-173-202]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 03:59:02 [WARNING|DP=3|PP=0|TP=11|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty.
-[default0]:07/03/2024 03:59:02 [WARNING|DP=3|PP=0|TP=8|ip-26-0-174-36]: Repo card metadata block was not found. Setting CardData to empty.
-[default4]:07/03/2024 03:59:02 [WARNING|DP=3|PP=0|TP=4|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty.
-[default5]:Repo card metadata block was not found. Setting CardData to empty.
-[default1]:Repo card metadata block was not found. Setting CardData to empty.
-[default7]:07/03/2024 03:59:02 [WARNING|DP=3|PP=0|TP=7|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty.
-[default3]:07/03/2024 03:59:02 [WARNING|DP=3|PP=0|TP=3|ip-26-0-173-246]: Repo card metadata block was not found. Setting CardData to empty.
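The configuration reported at [Start training] and the ZeRO sharding lines above are mutually consistent. The hypothetical check below (variable names are mine, not the run's) reproduces both numbers, assuming a data-parallel size of 4 as implied by DP Rank 0-3:

    # Illustrative consistency check of the logged run configuration.
    dp = 4                        # DP Rank 0..3 appear in the ZeRO sharding lines
    mbs, grad_accum = 8, 32       # from the [Start training] line
    print(mbs * grad_accum * dp)  # 1024, matching global_batch_size: 1024

    local_params = 69.4e6         # parameters held by one TP rank
    print(f"{local_params / dp / 1e6:.2f}M ({1 / dp:.0%})")  # 17.35M (25%); the log rounds the exact count to 17.4M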
-[default2]:07/03/2024 03:59:13 [WARNING|DP=2|PP=0|TP=10|ip-26-0-173-202]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default2]:07/03/2024 03:59:13 [WARNING|DP=2|PP=0|TP=10|ip-26-0-173-202]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default2]:Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default2]:Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default7]:07/03/2024 03:59:13 [WARNING|DP=0|PP=0|TP=7|ip-26-0-162-233]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default7]:07/03/2024 03:59:13 [WARNING|DP=0|PP=0|TP=7|ip-26-0-162-233]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default7]:Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default7]:Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
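The cached-dataset warnings above come from the `datasets` library falling back to a local copy when roneneldan/TinyStories cannot be fetched from the Hub. A minimal sketch of that behavior, assuming the dataset is already present in the local cache (this is not code from the benchmark itself):

    # Minimal sketch: trigger the same cache fallback deliberately instead of
    # relying on a network failure. Assumes roneneldan/TinyStories is cached locally.
    import os
    os.environ["HF_DATASETS_OFFLINE"] = "1"   # must be set before importing `datasets`
    from datasets import load_dataset

    ds = load_dataset("roneneldan/TinyStories")  # reuses the latest cached configuration
    print(ds)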
-[default0]:07/03/2024 03:59:13 [WARNING|DP=1|PP=0|TP=0|ip-26-0-164-207]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default0]:07/03/2024 03:59:13 [WARNING|DP=1|PP=0|TP=0|ip-26-0-164-207]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default0]:Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub
-[default0]:Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024).
-[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
-[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
-[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.) -[default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default7]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default7]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default0]:07/03/2024 03:59:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 567.45MiB. Peak allocated 10905.06MiB. Peak reserved: 11626.00MiB -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default3]: warnings.warn( -[default3]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. 
If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default4]: warnings.warn( -[default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default7]: warnings.warn( -[default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default5]: warnings.warn( -[default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default1]: warnings.warn( -[default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default2]: warnings.warn( -[default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default6]: warnings.warn( -[default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:2261: UserWarning: torch.distributed.all_reduce_coalesced will be deprecated. If you must use it, please revisit our documentation later at https://pytorch.org/docs/master/distributed.html#collective-functions -[default0]: warnings.warn( -[default1]:07/03/2024 03:59:31 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Using the latest cached version of the dataset since roneneldan/TinyStories couldn't be found on the Hugging Face Hub -[default1]:07/03/2024 03:59:31 [WARNING|DP=1|PP=0|TP=9|ip-26-0-165-24]: Found the latest cached dataset configuration 'default' at /admin/home/ferdinand_mom/.cache/roneneldan___tiny_stories/default/0.0.0/691b0d9bd48ade766778c940011ca1c549f6359b (last modified on Mon Jun 24 07:59:52 2024). 
-[default2]:07/03/2024 03:59:49 [WARNING|DP=1|PP=0|TP=2|ip-26-0-164-207]: Repo card metadata block was not found. Setting CardData to empty.
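The two UserWarnings above (the missing c10d::allreduce_ autograd kernel and the torch.distributed.all_reduce_coalesced deprecation) are emitted once per rank on every node, which is what makes the raw log so noisy; neither stops the run. If a quieter log.out is wanted, a minimal sketch (not part of this run's setup) is to export a Python warning filter before launching, which every spawned worker inherits:

    # Sketch only: mutes all UserWarnings in the workers, not just these two messages.
    export PYTHONWARNINGS="ignore::UserWarning"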
-[default0]:07/03/2024 04:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 60.3K | tokens_per_sec: 69.6K | tokens_per_sec_per_gpu: 1.09K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 0.0001 | model_tflops_per_gpu: 9.86 | hardware_tflops_per_gpu: 9.86 | grad_norm: 11.5 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 12.3G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 04:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 998.04MiB. Peak reserved: 11704.00MiB
-[default0]:07/03/2024 04:00:09 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 700.00MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:00:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 7.22K | tokens_per_sec: 581K | tokens_per_sec_per_gpu: 9.07K | global_batch_size: 1.02K | lm_loss: 11.4 | lr: 9.53e-05 | model_tflops_per_gpu: 82.3 | hardware_tflops_per_gpu: 82.3 | grad_norm: 11.6 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 12.3G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 04:00:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 998.05MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:00:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 700.00MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:00:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 6.12K | tokens_per_sec: 686K | tokens_per_sec_per_gpu: 10.7K | global_batch_size: 1.02K | lm_loss: 11.9 | lr: 9.05e-05 | model_tflops_per_gpu: 97.2 | hardware_tflops_per_gpu: 97.2 | grad_norm: 122 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 12.3G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 04:00:16 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 998.05MiB. Peak reserved: 11744.00MiB
-[default0]:STAGE:2024-07-03 04:00:16 1668446:1668446 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
-[default0]:07/03/2024 04:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 700.00MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 6.97K | tokens_per_sec: 602K | tokens_per_sec_per_gpu: 9.4K | global_batch_size: 1.02K | lm_loss: 12.3 | lr: 8.58e-05 | model_tflops_per_gpu: 85.3 | hardware_tflops_per_gpu: 85.3 | grad_norm: 18.3 | cuda_memory_allocated: 734M | cuda_max_memory_reserved: 12.3G | hd_total_memory_tb: 312G | hd_used_memory_tb: 66.1G | hd_free_memory_tb: 246G
-[default0]:07/03/2024 04:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 998.05MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:00:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 7.05K | tokens_per_sec: 595K | tokens_per_sec_per_gpu: 9.29K | global_batch_size: 1.02K | lm_loss: 11.2 | lr: 8.11e-05 | model_tflops_per_gpu: 84.3 | hardware_tflops_per_gpu: 84.3 | grad_norm: 29
-[default0]:07/03/2024 04:00:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:00:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 7.04K | tokens_per_sec: 596K | tokens_per_sec_per_gpu: 9.31K | global_batch_size: 1.02K | lm_loss: 10.2 | lr: 7.63e-05 | model_tflops_per_gpu: 84.5 | hardware_tflops_per_gpu: 84.5 | grad_norm: 10.4
-[default0]:STAGE:2024-07-03 04:00:55 1668446:1668446 ActivityProfilerController.cpp:320] Completed Stage: Collection
-[default0]:STAGE:2024-07-03 04:00:57 1668446:1668446 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
-[default0]:07/03/2024 04:03:13 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 5.72K | tokens_per_sec: 733K | tokens_per_sec_per_gpu: 11.5K | global_batch_size: 1.02K | lm_loss: 9.78 | lr: 7.16e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 9.29
-[default0]:07/03/2024 04:03:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 5.73K | tokens_per_sec: 732K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 10 | lr: 6.68e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 32.8
-[default0]:07/03/2024 04:03:24 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 5.72K | tokens_per_sec: 733K | tokens_per_sec_per_gpu: 11.5K | global_batch_size: 1.02K | lm_loss: 9.28 | lr: 6.21e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 9.81
-[default0]:07/03/2024 04:03:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 5.72K | tokens_per_sec: 733K | tokens_per_sec_per_gpu: 11.5K | global_batch_size: 1.02K | lm_loss: 9.04 | lr: 5.74e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 7.94
-[default0]:07/03/2024 04:03:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 5.73K | tokens_per_sec: 732K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 8.8 | lr: 5.26e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 6.87
-[default0]:07/03/2024 04:03:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 5.72K | tokens_per_sec: 733K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 8.53 | lr: 4.79e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 6.54
-[default0]:07/03/2024 04:03:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 5.74K | tokens_per_sec: 731K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 8.26 | lr: 4.32e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 6.14
-[default0]:07/03/2024 04:03:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:03:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 5.73K | tokens_per_sec: 732K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 8.04 | lr: 3.84e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 5.51
-[default0]:07/03/2024 04:03:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:04:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 5.73K | tokens_per_sec: 732K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 7.94 | lr: 3.37e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 6.09
-[default0]:07/03/2024 04:04:05 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:04:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 6.17K | tokens_per_sec: 680K | tokens_per_sec_per_gpu: 10.6K | global_batch_size: 1.02K | lm_loss: 7.88 | lr: 2.89e-05 | model_tflops_per_gpu: 96.4 | hardware_tflops_per_gpu: 96.4 | grad_norm: 6.89
-[default0]:07/03/2024 04:04:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:04:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 6.38K | tokens_per_sec: 658K | tokens_per_sec_per_gpu: 10.3K | global_batch_size: 1.02K | lm_loss: 7.74 | lr: 2.42e-05 | model_tflops_per_gpu: 93.3 | hardware_tflops_per_gpu: 93.3 | grad_norm: 5.78
-[default0]:07/03/2024 04:04:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:04:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 5.73K | tokens_per_sec: 732K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 7.61 | lr: 1.95e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 4.59
-[default0]:07/03/2024 04:04:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:04:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 5.74K | tokens_per_sec: 731K | tokens_per_sec_per_gpu: 11.4K | global_batch_size: 1.02K | lm_loss: 7.52 | lr: 1.47e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 4.5
-[default0]:07/03/2024 04:04:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: Memory usage: 699.99MiB. Peak allocated 11037.61MiB. Peak reserved: 11744.00MiB
-[default0]:07/03/2024 04:04:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-162-233]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 5.72K | tokens_per_sec: 733K | tokens_per_sec_per_gpu: 11.5K | global_batch_size: 1.02K | lm_loss: 7.46 | lr: 1e-05 | model_tflops_per_gpu: 104 | hardware_tflops_per_gpu: 104 | grad_norm: 4.57
-W0703 04:05:06.843000 140096564365120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_904569_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:05:06.844000 140400142636864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_415102_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:05:06.848000 140400142636864 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-164-207.ec2.internal_415102_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
-W0703 04:05:06.848000 140096564365120 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-165-24.ec2.internal_904569_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
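The per-iteration lines above carry the figures this benchmark is after: once the profiler's warm-up and collection stages are done, throughput settles around 11.4-11.5K tokens_per_sec_per_gpu and roughly 104 model_tflops_per_gpu. A rough sketch for pulling that summary straight out of a run's log.out (the log path below is assumed, not taken from this output):

    # Sketch only: print iteration number, per-GPU throughput and model TFLOPs per iteration.
    LOG=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/log.out
    grep -o 'iteration: .*' "$LOG" | awk -F' [|] ' '{ print $1 " | " $5 " | " $9 }'
    # e.g. iteration: 20 / 20 | tokens_per_sec_per_gpu: 11.5K | model_tflops_per_gpu: 104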
-Saved 1 csv files over 1 completed logs
-Processing file: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/profiler/ip-26-0-162-233_1668446.1719979363921033940.pt.trace.json
-Results written to /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/64_GPUS/dp-4_tp-16_pp-1_mbz-8/profiler.csv
-Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
- ip-26-0-162-233_1668446.1719979363921033940.pt.trace.json: 0%| | 0.00/4.52G [00:00
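The uploader's hint about `hf_transfer` above refers to huggingface_hub's optional Rust-based transfer backend, which tends to help for multi-gigabyte artifacts like the 4.52G trace being pushed here. A sketch of how it could be enabled before the upload step (not something this run did):

    # Sketch only: install the optional backend and tell huggingface_hub to use it.
    pip install hf_transfer
    export HF_HUB_ENABLE_HF_TRANSFER=1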